Files
doc_processer/diagnose_latex_rendering.py
2026-02-05 20:33:43 +08:00

203 lines
6.3 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Diagnostic tool for LaTeX rendering issues.
Usage:
python diagnose_latex_rendering.py "\\lambda + \\vdots"
python diagnose_latex_rendering.py "$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$"
"""
import sys
import re
from typing import Dict, Any
# Simulate the OCR postprocessing pipeline.

# Command names (without the leading backslash) that
# _split_glued_command_token may peel off the front of a longer command token.
_COMMANDS_NEED_SPACE = {
    # binary operators
    "cdot", "times", "div", "pm", "mp",
    # integrals
    "int", "iint", "iiint", "oint",
    # big operators and limits
    "sum", "prod", "lim",
    # trigonometric functions
    "sin", "cos", "tan", "cot", "sec", "csc",
    # logarithm / exponential
    "log", "ln", "exp",
    # differential operators
    "partial", "nabla",
}

# A backslash followed by one or more ASCII letters, e.g. "\alpha".
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")

# A "d" that is not directly preceded by a backslash and is followed by one
# uppercase (resp. lowercase) ASCII letter; the letter is captured as group 1.
# Only the backslash is excluded, so a "d" in the middle of a command name
# matches as well (the "da" in "\lambda", the "do" in "\vdots").
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")

# "$$...$$" is tried before "$...$"; both are non-greedy and DOTALL lets a
# segment span several lines.
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
def _split_glued_command_token(token: str) -> str:
    """Insert a space after a whitelisted command glued to trailing letters.

    ``token`` is expected to be a backslash followed by letters.  The longest
    *proper* prefix of the command name that appears in
    ``_COMMANDS_NEED_SPACE`` is separated from the rest by one space, e.g.
    ``\\intx`` -> ``\\int x``.  Tokens that do not start with a backslash,
    whose name is shorter than two characters, or that have no whitelisted
    proper prefix are returned unchanged.

    Because only prefixes are compared, a real command that merely starts
    with a whitelisted name is split too (``\\cdots`` -> ``\\cdot s``).
    """
    if token[:1] != "\\":
        return token

    name = token[1:]
    if len(name) < 2:
        return token

    # Try the longest candidate first so the first hit is the longest match.
    # The cut never reaches len(name), so the remainder is never empty.
    for cut in range(len(name) - 1, 0, -1):
        head = name[:cut]
        if head in _COMMANDS_NEED_SPACE:
            return f"\\{head} {name[cut:]}"
    return token
def _fix_ocr_number_errors(expr: str) -> str:
    """Remove stray whitespace that OCR inserted inside numbers.

    Four substitutions are applied in a fixed order, each on the output of
    the previous one:

    1. digit, whitespace, digits, ".", optional whitespace, digits
       ("2 2. 2" -> "22.2")
    2. digits, ".", whitespace, digits ("3. 14" -> "3.14")
    3. two single digits separated by whitespace when followed (after
       optional whitespace) by "," or ")", or when at the end of the string
    4. two single digits separated by whitespace when followed (after
       optional whitespace) by "."
    """
    rules = (
        (r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3"),
        (r"(\d+)\.\s+(\d+)", r"\1.\2"),
        (r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2"),
        (r"(\d)\s+(\d)(?=\s*\.)", r"\1\2"),
    )
    cleaned = expr
    for pattern, replacement in rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def _postprocess_math(expr: str) -> Dict[str, str]:
    """Postprocess a *math* expression and record every intermediate result.

    ``expr`` is the text already inside ``$...$`` or ``$$...$$`` (delimiters
    removed).  The stages run in order, each on the previous stage's output:

    * Stage 0 - ``_fix_ocr_number_errors``
    * Stage 1 - every ``_COMMAND_TOKEN_PATTERN`` match is passed through
      ``_split_glued_command_token``
    * Stage 2 - ``d`` + uppercase letter becomes ``\\mathrm{d} X``, then
      ``d`` + lowercase letter becomes ``d x``

    Returns:
        A dict with the keys ``original``, ``after_stage0_numbers``,
        ``after_stage1_commands``, ``after_stage2_differentials`` and
        ``final`` (``final`` equals the stage 2 output).
    """
    original = expr

    # Stage 0: fix OCR number errors
    expr = _fix_ocr_number_errors(expr)
    stage0 = expr

    # Stage 1: split glued command tokens
    expr = _COMMAND_TOKEN_PATTERN.sub(
        lambda m: _split_glued_command_token(m.group(0)), expr
    )
    stage1 = expr

    # Stage 2: normalize differentials (uppercase first, then lowercase)
    expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr)
    expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr)
    stage2 = expr

    return {
        "original": original,
        "after_stage0_numbers": stage0,
        "after_stage1_commands": stage1,
        "after_stage2_differentials": stage2,
        "final": expr,
    }
def _postprocess_markdown(markdown_content: str) -> Dict[str, Any]:
    """Apply LaTeX postprocessing to the math segments of a markdown string.

    Every ``_MATH_SEGMENT_PATTERN`` match is unwrapped, passed through
    ``_postprocess_math`` and re-wrapped in the same delimiters; text outside
    the matches is left untouched.

    Returns:
        A dict with the keys

        * ``original`` - the input as given
        * ``final`` - the input with every math segment rewritten
        * ``segments`` - one entry per match, in match order, holding
          ``type`` ("display" or "inline"), ``original`` (the match including
          delimiters) and ``processing`` (the ``_postprocess_math`` result)
        * ``changed`` - whether ``final`` differs from ``original``

        All four keys are present for empty input too, so callers can read
        ``changed`` unconditionally.
    """
    if not markdown_content:
        # Nothing to process; still return the full set of keys.
        return {
            "original": markdown_content,
            "final": markdown_content,
            "segments": [],
            "changed": False,
        }

    segments = []

    def _fix_segment(m: re.Match) -> str:
        seg = m.group(0)
        # NOTE(review): a bare "$$" is matched by the inline alternative of the
        # pattern but is classified as display here, so it comes back as
        # "$$$$".  Left as is to keep mirroring the simulated pipeline.
        if seg.startswith("$$") and seg.endswith("$$"):
            result = _postprocess_math(seg[2:-2])
            segments.append({"type": "display", "original": seg, "processing": result})
            return f"$${result['final']}$$"
        elif seg.startswith("$") and seg.endswith("$"):
            result = _postprocess_math(seg[1:-1])
            segments.append({"type": "inline", "original": seg, "processing": result})
            return f"${result['final']}$"
        # Defensive fallback: leave anything unexpected untouched.
        return seg

    final = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
    return {
        "original": markdown_content,
        "final": final,
        "segments": segments,
        "changed": markdown_content != final,
    }
def diagnose(latex_input: str) -> None:
    """Print a three-part diagnostic report for ``latex_input`` to stdout.

    1. Character detection - reports which of ``\\lambda``, ``\\vdots``,
       ``\\cdots`` and ``\\ldots`` occur in the input.
    2. Postprocessing pipeline - runs ``_postprocess_markdown`` and, for each
       math segment, shows which stage altered the text.
    3. Final output - prints the postprocessed text and whether it differs
       from the input.
    """
    banner = "=" * 80
    print(banner)
    print("LaTeX Rendering Diagnostic Tool")
    print(banner)
    print(f"\nInput: {latex_input}")
    print("-" * 80)

    # Check if input contains problematic characters
    print("\n1. Character Detection:")
    probes = (
        ("\\lambda", " ✅ Found \\lambda"),
        ("\\vdots", " ✅ Found \\vdots"),
        ("\\cdots", " Found \\cdots (similar to \\vdots)"),
        ("\\ldots", " Found \\ldots (similar to \\vdots)"),
    )
    for needle, message in probes:
        if needle in latex_input:
            print(message)

    # Run postprocessing pipeline
    print("\n2. Postprocessing Pipeline:")
    report = _postprocess_markdown(latex_input)
    found = report["segments"]
    if not found:
        print(" No math segments found (not wrapped in $ or $$)")
    for number, segment in enumerate(found, 1):
        print(f"\n Segment {number} ({segment['type']}):")
        print(f" Original: {segment['original']}")

        trace = segment["processing"]
        raw = trace["original"]
        numbers_out = trace["after_stage0_numbers"]
        commands_out = trace["after_stage1_commands"]
        differentials_out = trace["after_stage2_differentials"]

        # Each stage is compared with the output of the stage before it.
        if raw != numbers_out:
            print(f" ⚠️ Stage 0 (numbers): {numbers_out}")
        else:
            print(" ✅ Stage 0 (numbers): No change")

        if numbers_out != commands_out:
            print(f" ⚠️ Stage 1 (commands): {commands_out}")
        else:
            print(" ✅ Stage 1 (commands): No change")

        if commands_out != differentials_out:
            print(f" ⚠️ Stage 2 (differentials): {differentials_out}")
        else:
            print(" ✅ Stage 2 (differentials): No change")

        print(f" Final: {trace['final']}")

    print("\n3. Final Output:")
    print(f" {report['final']}")
    if report["changed"]:
        print("\n ⚠️ WARNING: The input was modified during postprocessing!")
        print(" This could be the cause of rendering issues.")
    else:
        print("\n ✅ No changes made during postprocessing.")
        print(" If rendering fails, the issue is likely in:")
        print(" - Pandoc conversion (LaTeX → MathML)")
        print(" - Frontend rendering (MathJax/KaTeX)")
    print("\n" + banner)
if __name__ == "__main__":
    # Only the first positional argument is used; any extras are ignored.
    if len(sys.argv) >= 2:
        latex_input = sys.argv[1]
        diagnose(latex_input)
    else:
        # No formula supplied: show usage and exit with status 1.
        print('Usage: python diagnose_latex_rendering.py "<latex_formula>"')
        print("\nExamples:")
        print(' python diagnose_latex_rendering.py "$\\lambda + \\vdots$"')
        print(' python diagnose_latex_rendering.py "$$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$$"')
        sys.exit(1)