fix: post-handling for OCR

This commit is contained in:
liuyuanchuang
2026-02-07 21:28:46 +08:00
parent f514f98142
commit 1a4d54ce34
2 changed files with 281 additions and 227 deletions

View File

@@ -136,6 +136,7 @@ class Converter:
"""Get cached XSLT transform for MathML to mml: conversion."""
if cls._mml_xslt_transform is None:
from lxml import etree
xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
cls._mml_xslt_transform = etree.XSLT(xslt_doc)
return cls._mml_xslt_transform
@@ -197,14 +198,17 @@ class Converter:
return ConvertResult(latex="", mathml="", mml="")
try:
# Detect if formula is display (block) or inline
is_display = self._is_display_formula(md_text)
# Extract the LaTeX formula content (remove delimiters)
latex_formula = self._extract_latex_formula(md_text)
# Preprocess formula for better conversion (fix array specifiers, etc.)
preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula)
# Convert to MathML
mathml = self._latex_to_mathml(preprocessed_formula)
# Convert to MathML (pass display flag to use correct delimiters)
mathml = self._latex_to_mathml(preprocessed_formula, is_display=is_display)
# Convert MathML to mml:math format (with namespace prefix)
mml = self._mathml_to_mml(mathml)
@@ -238,18 +242,18 @@ class Converter:
# Preprocess formula using the same preprocessing as export
preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip())
return self._latex_to_omml(preprocessed)
def _preprocess_formula_for_conversion(self, latex_formula: str) -> str:
"""Preprocess LaTeX formula for any conversion (MathML, OMML, etc.).
Applies the same preprocessing steps as preprocess_for_export to ensure
consistency across all conversion paths. This fixes common issues that
consistency across all conversion paths. This fixes common issues that
cause Pandoc conversion to fail.
Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py),
so we don't need to handle them here.
Note: OCR errors (number errors, command spacing) are fixed earlier in the
pipeline (in ocr_service.py), so we don't need to handle them here.
Args:
latex_formula: Pure LaTeX formula.
@@ -259,18 +263,38 @@ class Converter:
"""
# 1. Convert matrix environments
latex_formula = self._convert_matrix_environments(latex_formula)
# 2. Fix array column specifiers (remove spaces)
latex_formula = self._fix_array_column_specifiers(latex_formula)
# 3. Fix brace spacing
latex_formula = self._fix_brace_spacing(latex_formula)
# 4. Convert special environments (cases, aligned)
latex_formula = self._convert_special_environments(latex_formula)
return latex_formula
def _is_display_formula(self, text: str) -> bool:
"""Check if the formula is a display (block) formula.
Args:
text: Text containing LaTeX formula with delimiters.
Returns:
True if display formula ($$...$$ or \\[...\\]), False if inline.
"""
text = text.strip()
# Display math delimiters: $$...$$ or \[...\]
if text.startswith("$$") and text.endswith("$$"):
return True
if text.startswith("\\[") and text.endswith("\\]"):
return True
# Inline math delimiters: $...$ or \(...\)
return False
def _extract_latex_formula(self, text: str) -> str:
"""Extract LaTeX formula from text by removing delimiters.
@@ -299,18 +323,30 @@ class Converter:
@staticmethod
@lru_cache(maxsize=256)
def _latex_to_mathml_cached(latex_formula: str) -> str:
def _latex_to_mathml_cached(latex_formula: str, is_display: bool = False) -> str:
"""Cached conversion of LaTeX formula to MathML.
Uses Pandoc for conversion to ensure Word compatibility.
Pandoc generates standard MathML that Word can properly import.
Uses LRU cache to avoid recomputing for repeated formulas.
Args:
latex_formula: Pure LaTeX formula (without delimiters).
is_display: True if display (block) formula, False if inline.
Returns:
Standard MathML representation.
"""
# Use appropriate delimiters based on formula type
# Display formulas use $$...$$, inline formulas use $...$
if is_display:
pandoc_input = f"$${latex_formula}$$"
else:
pandoc_input = f"${latex_formula}$"
try:
# Use Pandoc for Word-compatible MathML (primary method)
mathml_html = pypandoc.convert_text(
f"${latex_formula}$",
pandoc_input,
"html",
format="markdown+tex_math_dollars",
extra_args=["--mathml"],
@@ -321,24 +357,23 @@ class Converter:
mathml = match.group(0)
# Post-process for Word compatibility
return Converter._postprocess_mathml_for_word(mathml)
# If no match, return as-is
return mathml_html.rstrip("\n")
# If Pandoc didn't generate MathML (returned HTML instead), use fallback
# This happens when Pandoc's mathml output format is not available or fails
raise ValueError("Pandoc did not generate MathML, got HTML instead")
except Exception as pandoc_error:
# Fallback: try latex2mathml (less Word-compatible)
try:
mathml = latex_to_mathml(latex_formula)
return Converter._postprocess_mathml_for_word(mathml)
except Exception as e:
raise RuntimeError(
f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
) from e
raise RuntimeError(f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}") from e
@staticmethod
def _postprocess_mathml_for_word(mathml: str) -> str:
"""Post-process MathML to improve Word compatibility.
Applies transformations to make MathML more compatible and concise:
- Remove <semantics> and <annotation> wrappers (Word doesn't need them)
- Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.)
@@ -346,32 +381,32 @@ class Converter:
- Change display="inline" to display="block" for better rendering
- Decode Unicode entities to actual characters (Word prefers this)
- Ensure proper namespace
Args:
mathml: MathML string.
Returns:
Simplified, Word-compatible MathML string.
"""
import re
# Step 1: Remove <semantics> and <annotation> wrappers
# These often cause Word import issues
if '<semantics>' in mathml:
if "<semantics>" in mathml:
# Extract content between <semantics> and <annotation>
match = re.search(r'<semantics>(.*?)<annotation', mathml, re.DOTALL)
match = re.search(r"<semantics>(.*?)<annotation", mathml, re.DOTALL)
if match:
content = match.group(1).strip()
# Get the math element attributes
math_attrs = ""
math_match = re.search(r'<math([^>]*)>', mathml)
math_match = re.search(r"<math([^>]*)>", mathml)
if math_match:
math_attrs = math_match.group(1)
# Rebuild without semantics
mathml = f'<math{math_attrs}>{content}</math>'
mathml = f"<math{math_attrs}>{content}</math>"
# Step 2: Remove unnecessary attributes that don't affect rendering
# These are verbose and Word doesn't need them
unnecessary_attrs = [
@@ -390,234 +425,231 @@ class Converter:
r'\s+class="[^"]*"',
r'\s+style="[^"]*"',
]
for attr_pattern in unnecessary_attrs:
mathml = re.sub(attr_pattern, '', mathml)
mathml = re.sub(attr_pattern, "", mathml)
# Step 3: Remove redundant single <mrow> wrapper at the top level
# Pattern: <math ...><mrow>content</mrow></math>
# Simplify to: <math ...>content</math>
mrow_pattern = r'(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)'
mrow_pattern = r"(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)"
match = re.search(mrow_pattern, mathml, re.DOTALL)
if match:
# Check if there's only one mrow at the top level
content = match.group(2)
# Only remove if the content doesn't have other top-level elements
if not re.search(r'</[^>]+>\s*<[^/]', content):
mathml = f'{match.group(1)}{content}{match.group(3)}'
if not re.search(r"</[^>]+>\s*<[^/]", content):
mathml = f"{match.group(1)}{content}{match.group(3)}"
# Step 4: Change display to block for better Word rendering
mathml = mathml.replace('display="inline"', 'display="block"')
# Step 5: If no display attribute, add it
if 'display=' not in mathml and '<math' in mathml:
mathml = mathml.replace('<math', '<math display="block"', 1)
if "display=" not in mathml and "<math" in mathml:
mathml = mathml.replace("<math", '<math display="block"', 1)
# Step 6: Ensure xmlns is present
if 'xmlns=' not in mathml and '<math' in mathml:
mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
if "xmlns=" not in mathml and "<math" in mathml:
mathml = mathml.replace("<math", '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
# Step 7: Decode common Unicode entities to actual characters (Word prefers this)
unicode_map = {
# Basic operators
'&#x0002B;': '+',
'&#x0002D;': '-',
'&#x0002A;': '*',
'&#x0002F;': '/',
'&#x0003D;': '=',
'&#x0003C;': '<',
'&#x0003E;': '>',
'&#x00028;': '(',
'&#x00029;': ')',
'&#x0002C;': ',',
'&#x0002E;': '.',
'&#x0007C;': '|',
'&#x00B0;': '°',
'&#x00D7;': '×', # times
'&#x00F7;': '÷', # div
'&#x00B1;': '±', # pm
'&#x2213;': '', # mp
"&#x0002B;": "+",
"&#x0002D;": "-",
"&#x0002A;": "*",
"&#x0002F;": "/",
"&#x0003D;": "=",
"&#x0003C;": "<",
"&#x0003E;": ">",
"&#x00028;": "(",
"&#x00029;": ")",
"&#x0002C;": ",",
"&#x0002E;": ".",
"&#x0007C;": "|",
"&#x00B0;": "°",
"&#x00D7;": "×", # times
"&#x00F7;": "÷", # div
"&#x00B1;": "±", # pm
"&#x2213;": "", # mp
# Ellipsis symbols
'&#x02026;': '', # ldots (horizontal)
'&#x022EE;': '', # vdots (vertical)
'&#x022EF;': '', # cdots (centered)
'&#x022F0;': '', # iddots (diagonal up)
'&#x022F1;': '', # ddots (diagonal down)
"&#x02026;": "", # ldots (horizontal)
"&#x022EE;": "", # vdots (vertical)
"&#x022EF;": "", # cdots (centered)
"&#x022F0;": "", # iddots (diagonal up)
"&#x022F1;": "", # ddots (diagonal down)
# Greek letters (lowercase)
'&#x03B1;': 'α', # alpha
'&#x03B2;': 'β', # beta
'&#x03B3;': 'γ', # gamma
'&#x03B4;': 'δ', # delta
'&#x03B5;': 'ε', # epsilon
'&#x03B6;': 'ζ', # zeta
'&#x03B7;': 'η', # eta
'&#x03B8;': 'θ', # theta
'&#x03B9;': 'ι', # iota
'&#x03BA;': 'κ', # kappa
'&#x03BB;': 'λ', # lambda
'&#x03BC;': 'μ', # mu
'&#x03BD;': 'ν', # nu
'&#x03BE;': 'ξ', # xi
'&#x03BF;': 'ο', # omicron
'&#x03C0;': 'π', # pi
'&#x03C1;': 'ρ', # rho
'&#x03C2;': 'ς', # final sigma
'&#x03C3;': 'σ', # sigma
'&#x03C4;': 'τ', # tau
'&#x03C5;': 'υ', # upsilon
'&#x03C6;': 'φ', # phi
'&#x03C7;': 'χ', # chi
'&#x03C8;': 'ψ', # psi
'&#x03C9;': 'ω', # omega
'&#x03D5;': 'ϕ', # phi variant
"&#x03B1;": "α", # alpha
"&#x03B2;": "β", # beta
"&#x03B3;": "γ", # gamma
"&#x03B4;": "δ", # delta
"&#x03B5;": "ε", # epsilon
"&#x03B6;": "ζ", # zeta
"&#x03B7;": "η", # eta
"&#x03B8;": "θ", # theta
"&#x03B9;": "ι", # iota
"&#x03BA;": "κ", # kappa
"&#x03BB;": "λ", # lambda
"&#x03BC;": "μ", # mu
"&#x03BD;": "ν", # nu
"&#x03BE;": "ξ", # xi
"&#x03BF;": "ο", # omicron
"&#x03C0;": "π", # pi
"&#x03C1;": "ρ", # rho
"&#x03C2;": "ς", # final sigma
"&#x03C3;": "σ", # sigma
"&#x03C4;": "τ", # tau
"&#x03C5;": "υ", # upsilon
"&#x03C6;": "φ", # phi
"&#x03C7;": "χ", # chi
"&#x03C8;": "ψ", # psi
"&#x03C9;": "ω", # omega
"&#x03D5;": "ϕ", # phi variant
# Greek letters (uppercase)
'&#x0391;': 'Α', # Alpha
'&#x0392;': 'Β', # Beta
'&#x0393;': 'Γ', # Gamma
'&#x0394;': 'Δ', # Delta
'&#x0395;': 'Ε', # Epsilon
'&#x0396;': 'Ζ', # Zeta
'&#x0397;': 'Η', # Eta
'&#x0398;': 'Θ', # Theta
'&#x0399;': 'Ι', # Iota
'&#x039A;': 'Κ', # Kappa
'&#x039B;': 'Λ', # Lambda
'&#x039C;': 'Μ', # Mu
'&#x039D;': 'Ν', # Nu
'&#x039E;': 'Ξ', # Xi
'&#x039F;': 'Ο', # Omicron
'&#x03A0;': 'Π', # Pi
'&#x03A1;': 'Ρ', # Rho
'&#x03A3;': 'Σ', # Sigma
'&#x03A4;': 'Τ', # Tau
'&#x03A5;': 'Υ', # Upsilon
'&#x03A6;': 'Φ', # Phi
'&#x03A7;': 'Χ', # Chi
'&#x03A8;': 'Ψ', # Psi
'&#x03A9;': 'Ω', # Omega
"&#x0391;": "Α", # Alpha
"&#x0392;": "Β", # Beta
"&#x0393;": "Γ", # Gamma
"&#x0394;": "Δ", # Delta
"&#x0395;": "Ε", # Epsilon
"&#x0396;": "Ζ", # Zeta
"&#x0397;": "Η", # Eta
"&#x0398;": "Θ", # Theta
"&#x0399;": "Ι", # Iota
"&#x039A;": "Κ", # Kappa
"&#x039B;": "Λ", # Lambda
"&#x039C;": "Μ", # Mu
"&#x039D;": "Ν", # Nu
"&#x039E;": "Ξ", # Xi
"&#x039F;": "Ο", # Omicron
"&#x03A0;": "Π", # Pi
"&#x03A1;": "Ρ", # Rho
"&#x03A3;": "Σ", # Sigma
"&#x03A4;": "Τ", # Tau
"&#x03A5;": "Υ", # Upsilon
"&#x03A6;": "Φ", # Phi
"&#x03A7;": "Χ", # Chi
"&#x03A8;": "Ψ", # Psi
"&#x03A9;": "Ω", # Omega
# Math symbols
'&#x2205;': '', # emptyset
'&#x2208;': '', # in
'&#x2209;': '', # notin
'&#x220B;': '', # ni
'&#x220C;': '', # nni
'&#x2211;': '', # sum
'&#x220F;': '', # prod
'&#x221A;': '', # sqrt
'&#x221B;': '', # cbrt
'&#x221C;': '', # fourthroot
'&#x221E;': '', # infty
'&#x2229;': '', # cap
'&#x222A;': '', # cup
'&#x222B;': '', # int
'&#x222C;': '', # iint
'&#x222D;': '', # iiint
'&#x222E;': '', # oint
'&#x2282;': '', # subset
'&#x2283;': '', # supset
'&#x2284;': '', # nsubset
'&#x2285;': '', # nsupset
'&#x2286;': '', # subseteq
'&#x2287;': '', # supseteq
'&#x2288;': '', # nsubseteq
'&#x2289;': '', # nsupseteq
'&#x2264;': '', # leq
'&#x2265;': '', # geq
'&#x2260;': '', # neq
'&#x2261;': '', # equiv
'&#x2248;': '', # approx
'&#x2243;': '', # simeq
'&#x2245;': '', # cong
'&#x2202;': '', # partial
'&#x2207;': '', # nabla
'&#x2200;': '', # forall
'&#x2203;': '', # exists
'&#x2204;': '', # nexists
'&#x00AC;': '¬', # neg/lnot
'&#x2227;': '', # wedge/land
'&#x2228;': '', # vee/lor
'&#x2192;': '', # to/rightarrow
'&#x2190;': '', # leftarrow
'&#x2194;': '', # leftrightarrow
'&#x21D2;': '', # Rightarrow
'&#x21D0;': '', # Leftarrow
'&#x21D4;': '', # Leftrightarrow
'&#x2191;': '', # uparrow
'&#x2193;': '', # downarrow
'&#x21D1;': '', # Uparrow
'&#x21D3;': '', # Downarrow
'&#x2195;': '', # updownarrow
'&#x21D5;': '', # Updownarrow
'&#x2260;': '', # ne
'&#x226A;': '', # ll
'&#x226B;': '', # gg
'&#x2A7D;': '', # leqslant
'&#x2A7E;': '', # geqslant
'&#x22A5;': '', # perp
'&#x2225;': '', # parallel
'&#x2220;': '', # angle
'&#x25B3;': '', # triangle
'&#x25A1;': '', # square
'&#x25CA;': '', # diamond
'&#x2660;': '', # spadesuit
'&#x2661;': '', # heartsuit
'&#x2662;': '', # diamondsuit
'&#x2663;': '', # clubsuit
'&#x2113;': '', # ell
'&#x2118;': '', # wp (Weierstrass p)
'&#x211C;': '', # Re (real part)
'&#x2111;': '', # Im (imaginary part)
'&#x2135;': '', # aleph
'&#x2136;': '', # beth
"&#x2205;": "", # emptyset
"&#x2208;": "", # in
"&#x2209;": "", # notin
"&#x220B;": "", # ni
"&#x220C;": "", # nni
"&#x2211;": "", # sum
"&#x220F;": "", # prod
"&#x221A;": "", # sqrt
"&#x221B;": "", # cbrt
"&#x221C;": "", # fourthroot
"&#x221E;": "", # infty
"&#x2229;": "", # cap
"&#x222A;": "", # cup
"&#x222B;": "", # int
"&#x222C;": "", # iint
"&#x222D;": "", # iiint
"&#x222E;": "", # oint
"&#x2282;": "", # subset
"&#x2283;": "", # supset
"&#x2284;": "", # nsubset
"&#x2285;": "", # nsupset
"&#x2286;": "", # subseteq
"&#x2287;": "", # supseteq
"&#x2288;": "", # nsubseteq
"&#x2289;": "", # nsupseteq
"&#x2264;": "", # leq
"&#x2265;": "", # geq
"&#x2260;": "", # neq
"&#x2261;": "", # equiv
"&#x2248;": "", # approx
"&#x2243;": "", # simeq
"&#x2245;": "", # cong
"&#x2202;": "", # partial
"&#x2207;": "", # nabla
"&#x2200;": "", # forall
"&#x2203;": "", # exists
"&#x2204;": "", # nexists
"&#x00AC;": "¬", # neg/lnot
"&#x2227;": "", # wedge/land
"&#x2228;": "", # vee/lor
"&#x2192;": "", # to/rightarrow
"&#x2190;": "", # leftarrow
"&#x2194;": "", # leftrightarrow
"&#x21D2;": "", # Rightarrow
"&#x21D0;": "", # Leftarrow
"&#x21D4;": "", # Leftrightarrow
"&#x2191;": "", # uparrow
"&#x2193;": "", # downarrow
"&#x21D1;": "", # Uparrow
"&#x21D3;": "", # Downarrow
"&#x2195;": "", # updownarrow
"&#x21D5;": "", # Updownarrow
"&#x2260;": "", # ne
"&#x226A;": "", # ll
"&#x226B;": "", # gg
"&#x2A7D;": "", # leqslant
"&#x2A7E;": "", # geqslant
"&#x22A5;": "", # perp
"&#x2225;": "", # parallel
"&#x2220;": "", # angle
"&#x25B3;": "", # triangle
"&#x25A1;": "", # square
"&#x25CA;": "", # diamond
"&#x2660;": "", # spadesuit
"&#x2661;": "", # heartsuit
"&#x2662;": "", # diamondsuit
"&#x2663;": "", # clubsuit
"&#x2113;": "", # ell
"&#x2118;": "", # wp (Weierstrass p)
"&#x211C;": "", # Re (real part)
"&#x2111;": "", # Im (imaginary part)
"&#x2135;": "", # aleph
"&#x2136;": "", # beth
}
for entity, char in unicode_map.items():
mathml = mathml.replace(entity, char)
# Also handle decimal entity format (&#NNNN;) for common characters
# Convert decimal to hex-based lookup
decimal_patterns = [
(r'&#955;', 'λ'), # lambda (decimal 955 = hex 03BB)
(r'&#8942;', ''), # vdots (decimal 8942 = hex 22EE)
(r'&#8943;', ''), # cdots (decimal 8943 = hex 22EF)
(r'&#8230;', ''), # ldots (decimal 8230 = hex 2026)
(r'&#8734;', ''), # infty (decimal 8734 = hex 221E)
(r'&#8721;', ''), # sum (decimal 8721 = hex 2211)
(r'&#8719;', ''), # prod (decimal 8719 = hex 220F)
(r'&#8730;', ''), # sqrt (decimal 8730 = hex 221A)
(r'&#8712;', ''), # in (decimal 8712 = hex 2208)
(r'&#8713;', ''), # notin (decimal 8713 = hex 2209)
(r'&#8745;', ''), # cap (decimal 8745 = hex 2229)
(r'&#8746;', ''), # cup (decimal 8746 = hex 222A)
(r'&#8804;', ''), # leq (decimal 8804 = hex 2264)
(r'&#8805;', ''), # geq (decimal 8805 = hex 2265)
(r'&#8800;', ''), # neq (decimal 8800 = hex 2260)
(r'&#8776;', ''), # approx (decimal 8776 = hex 2248)
(r'&#8801;', ''), # equiv (decimal 8801 = hex 2261)
(r"&#955;", "λ"), # lambda (decimal 955 = hex 03BB)
(r"&#8942;", ""), # vdots (decimal 8942 = hex 22EE)
(r"&#8943;", ""), # cdots (decimal 8943 = hex 22EF)
(r"&#8230;", ""), # ldots (decimal 8230 = hex 2026)
(r"&#8734;", ""), # infty (decimal 8734 = hex 221E)
(r"&#8721;", ""), # sum (decimal 8721 = hex 2211)
(r"&#8719;", ""), # prod (decimal 8719 = hex 220F)
(r"&#8730;", ""), # sqrt (decimal 8730 = hex 221A)
(r"&#8712;", ""), # in (decimal 8712 = hex 2208)
(r"&#8713;", ""), # notin (decimal 8713 = hex 2209)
(r"&#8745;", ""), # cap (decimal 8745 = hex 2229)
(r"&#8746;", ""), # cup (decimal 8746 = hex 222A)
(r"&#8804;", ""), # leq (decimal 8804 = hex 2264)
(r"&#8805;", ""), # geq (decimal 8805 = hex 2265)
(r"&#8800;", ""), # neq (decimal 8800 = hex 2260)
(r"&#8776;", ""), # approx (decimal 8776 = hex 2248)
(r"&#8801;", ""), # equiv (decimal 8801 = hex 2261)
]
for pattern, char in decimal_patterns:
mathml = mathml.replace(pattern, char)
# Step 8: Clean up extra whitespace
mathml = re.sub(r'>\s+<', '><', mathml)
mathml = re.sub(r">\s+<", "><", mathml)
return mathml
def _latex_to_mathml(self, latex_formula: str) -> str:
def _latex_to_mathml(self, latex_formula: str, is_display: bool = False) -> str:
"""Convert LaTeX formula to standard MathML.
Args:
latex_formula: Pure LaTeX formula (without delimiters).
is_display: True if display (block) formula, False if inline.
Returns:
Standard MathML representation.
"""
return self._latex_to_mathml_cached(latex_formula)
return self._latex_to_mathml_cached(latex_formula, is_display=is_display)
def _mathml_to_mml(self, mathml: str) -> str:
"""Convert standard MathML to mml:math format with namespace prefix.

View File

@@ -41,12 +41,23 @@ _COMMANDS_NEED_SPACE = {
"log",
"ln",
"exp",
# set relations (often glued by OCR)
"in",
"notin",
"subset",
"supset",
"subseteq",
"supseteq",
"cap",
"cup",
# misc
"partial",
"nabla",
}
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
# Match LaTeX commands: \command (greedy match all letters)
# The splitting logic in _split_glued_command_token will handle \inX -> \in X
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
# stage2: differentials inside math segments
@@ -65,6 +76,7 @@ def _split_glued_command_token(token: str) -> str:
Examples:
- \\cdotdS -> \\cdot dS
- \\intdx -> \\int dx
- \\inX -> \\in X (stop at uppercase letter)
"""
if not token.startswith("\\"):
return token
@@ -74,8 +86,8 @@ def _split_glued_command_token(token: str) -> str:
return token
best = None
# longest prefix that is in whitelist
for i in range(1, len(body)):
# Find longest prefix that is in whitelist
for i in range(1, len(body) + 1):
prefix = body[:i]
if prefix in _COMMANDS_NEED_SPACE:
best = prefix
@@ -117,12 +129,22 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
# Pattern 2: Spaces inside braces that follow _ or ^
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
# BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them
# must be preserved as they serve as command terminators (\in X != \inX)
def clean_subscript_superscript_braces(match):
    """Strip OCR-introduced spaces from a ``_{...}`` / ``^{...}`` group.

    Args:
        match: Regex match whose group 1 is the operator (``_`` or ``^``)
            and whose group 2 is the text between the braces.

    Returns:
        The rebuilt ``operator{content}`` string with spurious whitespace
        removed.
    """
    operator = match.group(1)  # _ or ^
    content = match.group(2)  # content inside braces

    if "\\" not in content:
        # No LaTeX commands: every space is an OCR artifact.
        cleaned = re.sub(r"\s+", "", content)
    else:
        # Whitespace in front of a backslash is never needed
        # (x \in -> x\in, \alpha \beta -> \alpha\beta).
        cleaned = re.sub(r"\s+(?=\\)", "", content)
        # Whitespace right after a \command terminates the command name and
        # must survive (\in X != \inX). The first alternative consumes the
        # command together with that whitespace and puts it back verbatim.
        # Any other whitespace is dropped, including whitespace after a plain
        # letter (i j -> ij), unless a backslash directly precedes it
        # (control space "\ ").
        cleaned = re.sub(
            r"(\\[a-zA-Z]+\s+)|(?<!\\)\s+",
            lambda m: m.group(1) or "",
            cleaned,
        )
    return f"{operator}{{{cleaned}}}"
# Match _{ ... } or ^{ ... }
@@ -156,7 +178,7 @@ def _postprocess_math(expr: str) -> str:
Processing stages:
0. Fix OCR number errors (spaces in numbers)
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS, \\inX -> \\in X)
2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
3. Normalize differentials (DISABLED by default to avoid breaking variables)
@@ -169,7 +191,7 @@ def _postprocess_math(expr: str) -> str:
# stage0: fix OCR number errors (digits with spaces)
expr = _fix_ocr_number_errors(expr)
# stage1: split glued command tokens (e.g. \cdotdS)
# stage1: split glued command tokens (e.g. \cdotdS, \inX)
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
# stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)