"""Diagnostic tool for LaTeX rendering issues. Usage: python diagnose_latex_rendering.py "\\lambda + \\vdots" python diagnose_latex_rendering.py "$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$" """ import sys import re from typing import Dict, Any # Simulate the OCR postprocessing pipeline _COMMANDS_NEED_SPACE = { "cdot", "times", "div", "pm", "mp", "int", "iint", "iiint", "oint", "sum", "prod", "lim", "sin", "cos", "tan", "cot", "sec", "csc", "log", "ln", "exp", "partial", "nabla", } _COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+") _DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(? str: """Split OCR-glued LaTeX command token by whitelist longest-prefix.""" if not token.startswith("\\"): return token body = token[1:] if len(body) < 2: return token best = None for i in range(1, len(body)): prefix = body[:i] if prefix in _COMMANDS_NEED_SPACE: best = prefix if not best: return token suffix = body[len(best) :] if not suffix: return token return f"\\{best} {suffix}" def _fix_ocr_number_errors(expr: str) -> str: """Fix common OCR errors in LaTeX math expressions.""" expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr) expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr) expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr) expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr) return expr def _postprocess_math(expr: str) -> str: """Postprocess a *math* expression (already inside $...$ or $$...$$).""" original = expr # Stage 0: fix OCR number errors expr = _fix_ocr_number_errors(expr) stage0 = expr # Stage 1: split glued command tokens expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr) stage1 = expr # Stage 2: normalize differentials expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr) expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr) stage2 = expr return {"original": original, "after_stage0_numbers": stage0, "after_stage1_commands": stage1, "after_stage2_differentials": stage2, "final": expr} def _postprocess_markdown(markdown_content: str) -> Dict[str, Any]: """Apply LaTeX postprocessing to markdown segments.""" if not markdown_content: return {"original": markdown_content, "final": markdown_content, "segments": []} segments = [] def _fix_segment(m: re.Match) -> str: seg = m.group(0) inner = None if seg.startswith("$$") and seg.endswith("$$"): inner = seg[2:-2] result = _postprocess_math(inner) segments.append({"type": "display", "original": seg, "processing": result}) return f"$${result['final']}$$" elif seg.startswith("$") and seg.endswith("$"): inner = seg[1:-1] result = _postprocess_math(inner) segments.append({"type": "inline", "original": seg, "processing": result}) return f"${result['final']}$" return seg final = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content) return {"original": markdown_content, "final": final, "segments": segments, "changed": markdown_content != final} def diagnose(latex_input: str) -> None: """Run diagnostic on LaTeX input.""" print("=" * 80) print("LaTeX Rendering Diagnostic Tool") print("=" * 80) print(f"\nInput: {latex_input}") print("-" * 80) # Check if input contains problematic characters print("\n1. Character Detection:") if "\\lambda" in latex_input: print(" ✅ Found \\lambda") if "\\vdots" in latex_input: print(" ✅ Found \\vdots") if "\\cdots" in latex_input: print(" ℹ️ Found \\cdots (similar to \\vdots)") if "\\ldots" in latex_input: print(" ℹ️ Found \\ldots (similar to \\vdots)") # Run postprocessing pipeline print("\n2. Postprocessing Pipeline:") result = _postprocess_markdown(latex_input) if result["segments"]: for i, seg in enumerate(result["segments"], 1): print(f"\n Segment {i} ({seg['type']}):") print(f" Original: {seg['original']}") proc = seg["processing"] # Check each stage for changes if proc["original"] != proc["after_stage0_numbers"]: print(f" ⚠️ Stage 0 (numbers): {proc['after_stage0_numbers']}") else: print(f" ✅ Stage 0 (numbers): No change") if proc["after_stage0_numbers"] != proc["after_stage1_commands"]: print(f" ⚠️ Stage 1 (commands): {proc['after_stage1_commands']}") else: print(f" ✅ Stage 1 (commands): No change") if proc["after_stage1_commands"] != proc["after_stage2_differentials"]: print(f" ⚠️ Stage 2 (differentials): {proc['after_stage2_differentials']}") else: print(f" ✅ Stage 2 (differentials): No change") print(f" Final: {proc['final']}") else: print(" ℹ️ No math segments found (not wrapped in $ or $$)") print("\n3. Final Output:") print(f" {result['final']}") if result["changed"]: print("\n ⚠️ WARNING: The input was modified during postprocessing!") print(" This could be the cause of rendering issues.") else: print("\n ✅ No changes made during postprocessing.") print(" If rendering fails, the issue is likely in:") print(" - Pandoc conversion (LaTeX → MathML)") print(" - Frontend rendering (MathJax/KaTeX)") print("\n" + "=" * 80) if __name__ == "__main__": if len(sys.argv) < 2: print('Usage: python diagnose_latex_rendering.py ""') print("\nExamples:") print(' python diagnose_latex_rendering.py "$\\lambda + \\vdots$"') print(' python diagnose_latex_rendering.py "$$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$$"') sys.exit(1) latex_input = sys.argv[1] diagnose(latex_input)