203 lines
6.3 KiB
Python
203 lines
6.3 KiB
Python
|
|
"""Diagnostic tool for LaTeX rendering issues.
|
|||
|
|
|
|||
|
|
Usage:
|
|||
|
|
python diagnose_latex_rendering.py "\\lambda + \\vdots"
|
|||
|
|
python diagnose_latex_rendering.py "$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$"
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import sys
|
|||
|
|
import re
|
|||
|
|
from typing import Dict, Any
|
|||
|
|
|
|||
|
|
# Local copy of the OCR postprocessing pipeline, kept here so the diagnostic
# can replay each stage and show exactly where an expression gets altered.

# Command names (without the leading backslash) that must be followed by a
# space when OCR glues them to the next token, e.g. "\intx" -> "\int x".
_COMMANDS_NEED_SPACE = {
    # binary operators
    "cdot", "times", "div", "pm", "mp",
    # integrals
    "int", "iint", "iiint", "oint",
    # big operators and limits
    "sum", "prod", "lim",
    # trigonometric functions
    "sin", "cos", "tan", "cot", "sec", "csc",
    # logarithms and exponential
    "log", "ln", "exp",
    # differential operators
    "partial", "nabla",
}

# A backslash followed by one or more ASCII letters, i.e. one command token.
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")

# "d" + uppercase letter, where the "d" is not directly preceded by a backslash.
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")

# "d" + lowercase letter, where the "d" is not directly preceded by a backslash.
# NOTE: nothing restricts this to a standalone "d", so it also matches inside
# command names (the "da" of "\lambda", the "do" of "\vdots").
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")

# Display math ($$...$$) is listed first so it wins over inline math ($...$)
# at the same position; both are non-greedy and DOTALL lets them span lines.
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _split_glued_command_token(token: str) -> str:
    """Insert a space after a whitelisted command glued to trailing letters.

    The longest proper prefix of the command name that appears in
    ``_COMMANDS_NEED_SPACE`` is split off, e.g. ``\\intx`` -> ``\\int x``.
    Tokens that do not start with a backslash, have a name shorter than two
    letters, or have no whitelisted proper prefix are returned unchanged.
    """
    if not token.startswith("\\"):
        return token

    name = token[1:]
    if len(name) < 2:
        return token

    # Try cut points from the longest proper prefix down to a single letter;
    # the first hit is therefore the longest match. The cut is always strictly
    # inside the name, so the remainder is never empty.
    for cut in range(len(name) - 1, 0, -1):
        head = name[:cut]
        if head in _COMMANDS_NEED_SPACE:
            return f"\\{head} {name[cut:]}"

    return token
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _fix_ocr_number_errors(expr: str) -> str:
    """Remove stray whitespace that OCR inserts inside numbers.

    The rules run in a fixed order, each one on the output of the previous
    one, and the rewritten expression is returned.
    """
    rewrite_rules = (
        # digit, gap, then a decimal number: "2 2. 2" -> "22.2"
        (r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3"),
        # gap right after the decimal point: "3. 14" -> "3.14"
        (r"(\d+)\.\s+(\d+)", r"\1.\2"),
        # two digits split by a gap, followed by "," or ")" or end of string
        (r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2"),
        # two digits split by a gap, followed by a "."
        (r"(\d)\s+(\d)(?=\s*\.)", r"\1\2"),
    )

    cleaned = expr
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _postprocess_math(expr: str) -> Dict[str, str]:
    """Run the postprocessing stages on a math expression, recording each step.

    Args:
        expr: The content of a math segment, without its ``$`` / ``$$``
            delimiters.

    Returns:
        A dict of snapshots with the keys ``"original"``,
        ``"after_stage0_numbers"``, ``"after_stage1_commands"``,
        ``"after_stage2_differentials"`` and ``"final"`` (identical to the
        stage 2 result), so callers can see which stage changed the text.
    """
    original = expr

    # Stage 0: fix OCR number errors
    expr = _fix_ocr_number_errors(expr)
    stage0 = expr

    # Stage 1: split glued command tokens
    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
    stage1 = expr

    # Stage 2: normalize differentials ("dX" -> "\mathrm{d} X", "dx" -> "d x").
    # NOTE: the lowercase pattern matches any "d" + lowercase letter that is not
    # directly preceded by a backslash, so it also fires inside command names
    # (e.g. "\lambda" becomes "\lambd a"). This is left as-is on purpose: the
    # tool exists to expose what the pipeline does to its input.
    expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr)
    expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr)
    stage2 = expr

    return {
        "original": original,
        "after_stage0_numbers": stage0,
        "after_stage1_commands": stage1,
        "after_stage2_differentials": stage2,
        "final": expr,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _postprocess_markdown(markdown_content: str) -> Dict[str, Any]:
    """Apply the math postprocessing to every ``$...$`` / ``$$...$$`` segment.

    Text outside math delimiters is passed through untouched.

    Args:
        markdown_content: Text that may contain inline or display math.

    Returns:
        A dict with the keys ``"original"`` (the input), ``"final"`` (the
        rewritten text), ``"segments"`` (one record per math segment with its
        ``"type"``, ``"original"`` text and per-stage ``"processing"`` dict)
        and ``"changed"`` (whether ``final`` differs from ``original``).
    """
    if not markdown_content:
        # Return the same keys as the normal path so callers can read
        # "changed" without special-casing empty input.
        return {
            "original": markdown_content,
            "final": markdown_content,
            "segments": [],
            "changed": False,
        }

    segments = []

    def _fix_segment(m: re.Match) -> str:
        """Postprocess one matched math segment and record what happened."""
        seg = m.group(0)

        # Check "$$" first: a display segment also starts and ends with "$".
        if seg.startswith("$$") and seg.endswith("$$"):
            result = _postprocess_math(seg[2:-2])
            segments.append({"type": "display", "original": seg, "processing": result})
            return f"$${result['final']}$$"
        if seg.startswith("$") and seg.endswith("$"):
            result = _postprocess_math(seg[1:-1])
            segments.append({"type": "inline", "original": seg, "processing": result})
            return f"${result['final']}$"

        return seg

    final = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)

    return {
        "original": markdown_content,
        "final": final,
        "segments": segments,
        "changed": markdown_content != final,
    }
|
|||
|
|
|
|||
|
|
|
|||
|
|
def diagnose(latex_input: str) -> None:
    """Print a report showing how the postprocessing pipeline treats the input.

    The report has three parts: which of a few known commands occur in the
    input, a per-segment breakdown of what each pipeline stage changed, and
    the final text together with a verdict on whether it was modified.
    """
    banner = "=" * 80
    print(banner)
    print("LaTeX Rendering Diagnostic Tool")
    print(banner)
    print(f"\nInput: {latex_input}")
    print("-" * 80)

    # Part 1: report which of the commands of interest occur in the input.
    print("\n1. Character Detection:")
    detections = (
        ("\\lambda", " ✅ Found \\lambda"),
        ("\\vdots", " ✅ Found \\vdots"),
        ("\\cdots", " ℹ️ Found \\cdots (similar to \\vdots)"),
        ("\\ldots", " ℹ️ Found \\ldots (similar to \\vdots)"),
    )
    for needle, message in detections:
        if needle in latex_input:
            print(message)

    # Part 2: replay the pipeline and compare consecutive stage snapshots.
    print("\n2. Postprocessing Pipeline:")
    result = _postprocess_markdown(latex_input)

    # (snapshot before, snapshot after, label shown in the report)
    stage_checks = (
        ("original", "after_stage0_numbers", "Stage 0 (numbers)"),
        ("after_stage0_numbers", "after_stage1_commands", "Stage 1 (commands)"),
        ("after_stage1_commands", "after_stage2_differentials", "Stage 2 (differentials)"),
    )

    if not result["segments"]:
        print(" ℹ️ No math segments found (not wrapped in $ or $$)")
    for index, segment in enumerate(result["segments"], 1):
        print(f"\n Segment {index} ({segment['type']}):")
        print(f" Original: {segment['original']}")

        snapshots = segment["processing"]
        for before_key, after_key, label in stage_checks:
            if snapshots[before_key] != snapshots[after_key]:
                print(f" ⚠️ {label}: {snapshots[after_key]}")
            else:
                print(f" ✅ {label}: No change")

        print(f" Final: {snapshots['final']}")

    # Part 3: final text and verdict.
    print("\n3. Final Output:")
    print(f" {result['final']}")

    if result["changed"]:
        verdict_lines = (
            "\n ⚠️ WARNING: The input was modified during postprocessing!",
            " This could be the cause of rendering issues.",
        )
    else:
        verdict_lines = (
            "\n ✅ No changes made during postprocessing.",
            " If rendering fails, the issue is likely in:",
            " - Pandoc conversion (LaTeX → MathML)",
            " - Frontend rendering (MathJax/KaTeX)",
        )
    for line in verdict_lines:
        print(line)

    print("\n" + banner)
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # The first positional argument is the text to diagnose; anything after
    # it is ignored. With no argument, show the usage text and exit with 1.
    if not sys.argv[1:]:
        usage_lines = (
            'Usage: python diagnose_latex_rendering.py "<latex_formula>"',
            "\nExamples:",
            ' python diagnose_latex_rendering.py "$\\lambda + \\vdots$"',
            ' python diagnose_latex_rendering.py "$$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$$"',
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    latex_input = sys.argv[1]
    diagnose(latex_input)
|