From 1a4d54ce344fc5cd63e2df2d263e8d420dc22ce8 Mon Sep 17 00:00:00 2001
From: liuyuanchuang <yuanchuang_liu@qingsongchou.com>
Date: Sat, 7 Feb 2026 21:28:46 +0800
Subject: [PATCH] fix: post handle for ocr

---
 app/services/converter.py   | 472 +++++++++++++++++++-----------------
 app/services/ocr_service.py |  36 ++-
 2 files changed, 281 insertions(+), 227 deletions(-)

diff --git a/app/services/converter.py b/app/services/converter.py
index b2b02a3..792fac4 100644
--- a/app/services/converter.py
+++ b/app/services/converter.py
@@ -136,6 +136,7 @@ class Converter:
         """Get cached XSLT transform for MathML to mml: conversion."""
         if cls._mml_xslt_transform is None:
             from lxml import etree
+
             xslt_doc = etree.fromstring(MML_XSLT.encode("utf-8"))
             cls._mml_xslt_transform = etree.XSLT(xslt_doc)
         return cls._mml_xslt_transform
@@ -197,14 +198,17 @@ class Converter:
             return ConvertResult(latex="", mathml="", mml="")
 
         try:
+            # Detect if formula is display (block) or inline
+            is_display = self._is_display_formula(md_text)
+
             # Extract the LaTeX formula content (remove delimiters)
             latex_formula = self._extract_latex_formula(md_text)
 
             # Preprocess formula for better conversion (fix array specifiers, etc.)
             preprocessed_formula = self._preprocess_formula_for_conversion(latex_formula)
 
-            # Convert to MathML
-            mathml = self._latex_to_mathml(preprocessed_formula)
+            # Convert to MathML (pass display flag to use correct delimiters)
+            mathml = self._latex_to_mathml(preprocessed_formula, is_display=is_display)
 
             # Convert MathML to mml:math format (with namespace prefix)
             mml = self._mathml_to_mml(mathml)
@@ -238,18 +242,18 @@ class Converter:
 
         # Preprocess formula using the same preprocessing as export
         preprocessed = self._preprocess_formula_for_conversion(latex_formula.strip())
-        
+
         return self._latex_to_omml(preprocessed)
 
     def _preprocess_formula_for_conversion(self, latex_formula: str) -> str:
         """Preprocess LaTeX formula for any conversion (MathML, OMML, etc.).
 
         Applies the same preprocessing steps as preprocess_for_export to ensure
-        consistency across all conversion paths. This fixes common issues that 
+        consistency across all conversion paths. This fixes common issues that
         cause Pandoc conversion to fail.
 
-        Note: OCR number errors are fixed earlier in the pipeline (in ocr_service.py),
-        so we don't need to handle them here.
+        Note: OCR errors (number errors, command spacing) are fixed earlier in the
+        pipeline (in ocr_service.py), so we don't need to handle them here.
 
         Args:
             latex_formula: Pure LaTeX formula.
@@ -259,18 +263,38 @@ class Converter:
         """
         # 1. Convert matrix environments
         latex_formula = self._convert_matrix_environments(latex_formula)
-        
+
         # 2. Fix array column specifiers (remove spaces)
         latex_formula = self._fix_array_column_specifiers(latex_formula)
-        
+
         # 3. Fix brace spacing
         latex_formula = self._fix_brace_spacing(latex_formula)
-        
+
         # 4. Convert special environments (cases, aligned)
         latex_formula = self._convert_special_environments(latex_formula)
-        
+
         return latex_formula
 
+    def _is_display_formula(self, text: str) -> bool:
+        """Report whether ``text`` is wrapped in display (block) math delimiters.
+
+        Args:
+            text: Formula text with delimiters; outer whitespace is ignored.
+
+        Returns:
+            True for $$...$$ or \\[...\\]; False otherwise (treated as inline).
+        """
+        text = text.strip()
+
+        # Display math delimiters: $$...$$ or \[...\]
+        if text.startswith("$$") and text.endswith("$$"):
+            return True
+        if text.startswith("\\[") and text.endswith("\\]"):
+            return True
+
+        # Anything else ($...$, \(...\), or undelimited text) counts as inline
+        return False
+
     def _extract_latex_formula(self, text: str) -> str:
         """Extract LaTeX formula from text by removing delimiters.
 
@@ -299,18 +323,30 @@ class Converter:
 
     @staticmethod
     @lru_cache(maxsize=256)
-    def _latex_to_mathml_cached(latex_formula: str) -> str:
+    def _latex_to_mathml_cached(latex_formula: str, is_display: bool = False) -> str:
         """Cached conversion of LaTeX formula to MathML.
 
         Uses Pandoc for conversion to ensure Word compatibility.
         Pandoc generates standard MathML that Word can properly import.
 
-        Uses LRU cache to avoid recomputing for repeated formulas.
+        Args:
+            latex_formula: Pure LaTeX formula (without delimiters).
+            is_display: True if display (block) formula, False if inline.
+
+        Returns:
+            Standard MathML representation.
         """
+        # Use appropriate delimiters based on formula type
+        # Display formulas use $$...$$, inline formulas use $...$
+        if is_display:
+            pandoc_input = f"$${latex_formula}$$"
+        else:
+            pandoc_input = f"${latex_formula}$"
+
         try:
             # Use Pandoc for Word-compatible MathML (primary method)
             mathml_html = pypandoc.convert_text(
-                f"${latex_formula}$",
+                pandoc_input,
                 "html",
                 format="markdown+tex_math_dollars",
                 extra_args=["--mathml"],
@@ -321,24 +357,23 @@ class Converter:
                 mathml = match.group(0)
                 # Post-process for Word compatibility
                 return Converter._postprocess_mathml_for_word(mathml)
-            
-            # If no match, return as-is
-            return mathml_html.rstrip("\n")
-            
+
+            # If Pandoc didn't generate MathML (returned HTML instead), use fallback
+            # This happens when Pandoc's mathml output format is not available or fails
+            raise ValueError("Pandoc did not generate MathML, got HTML instead")
+
         except Exception as pandoc_error:
             # Fallback: try latex2mathml (less Word-compatible)
             try:
                 mathml = latex_to_mathml(latex_formula)
                 return Converter._postprocess_mathml_for_word(mathml)
             except Exception as e:
-                raise RuntimeError(
-                    f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
-                ) from e
-    
+                raise RuntimeError(f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}") from e
+
     @staticmethod
     def _postprocess_mathml_for_word(mathml: str) -> str:
         """Post-process MathML to improve Word compatibility.
-        
+
         Applies transformations to make MathML more compatible and concise:
         - Remove <semantics> and <annotation> wrappers (Word doesn't need them)
         - Remove unnecessary attributes (form, stretchy, fence, columnalign, etc.)
@@ -346,32 +381,32 @@ class Converter:
         - Change display="inline" to display="block" for better rendering
         - Decode Unicode entities to actual characters (Word prefers this)
         - Ensure proper namespace
-        
+
         Args:
             mathml: MathML string.
-            
+
         Returns:
             Simplified, Word-compatible MathML string.
         """
         import re
-        
+
         # Step 1: Remove <semantics> and <annotation> wrappers
         # These often cause Word import issues
-        if '<semantics>' in mathml:
+        if "<semantics>" in mathml:
             # Extract content between <semantics> and <annotation>
-            match = re.search(r'<semantics>(.*?)<annotation', mathml, re.DOTALL)
+            match = re.search(r"<semantics>(.*?)<annotation", mathml, re.DOTALL)
             if match:
                 content = match.group(1).strip()
-                
+
                 # Get the math element attributes
                 math_attrs = ""
-                math_match = re.search(r'<math([^>]*)>', mathml)
+                math_match = re.search(r"<math([^>]*)>", mathml)
                 if math_match:
                     math_attrs = math_match.group(1)
-                
+
                 # Rebuild without semantics
-                mathml = f'<math{math_attrs}>{content}</math>'
-        
+                mathml = f"<math{math_attrs}>{content}</math>"
+
         # Step 2: Remove unnecessary attributes that don't affect rendering
         # These are verbose and Word doesn't need them
         unnecessary_attrs = [
@@ -390,234 +425,231 @@ class Converter:
             r'\s+class="[^"]*"',
             r'\s+style="[^"]*"',
         ]
-        
+
         for attr_pattern in unnecessary_attrs:
-            mathml = re.sub(attr_pattern, '', mathml)
-        
+            mathml = re.sub(attr_pattern, "", mathml)
+
         # Step 3: Remove redundant single <mrow> wrapper at the top level
         # Pattern: <math ...><mrow>content</mrow></math>
         # Simplify to: <math ...>content</math>
-        mrow_pattern = r'(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)'
+        mrow_pattern = r"(<math[^>]*>)\s*<mrow>(.*?)</mrow>\s*(</math>)"
         match = re.search(mrow_pattern, mathml, re.DOTALL)
         if match:
             # Check if there's only one mrow at the top level
             content = match.group(2)
             # Only remove if the content doesn't have other top-level elements
-            if not re.search(r'</[^>]+>\s*<[^/]', content):
-                mathml = f'{match.group(1)}{content}{match.group(3)}'
-        
+            if not re.search(r"</[^>]+>\s*<[^/]", content):
+                mathml = f"{match.group(1)}{content}{match.group(3)}"
+
         # Step 4: Change display to block for better Word rendering
         mathml = mathml.replace('display="inline"', 'display="block"')
-        
+
         # Step 5: If no display attribute, add it
-        if 'display=' not in mathml and '<math' in mathml:
-            mathml = mathml.replace('<math', '<math display="block"', 1)
-        
+        if "display=" not in mathml and "<math" in mathml:
+            mathml = mathml.replace("<math", '<math display="block"', 1)
+
         # Step 6: Ensure xmlns is present
-        if 'xmlns=' not in mathml and '<math' in mathml:
-            mathml = mathml.replace('<math', '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
-        
+        if "xmlns=" not in mathml and "<math" in mathml:
+            mathml = mathml.replace("<math", '<math xmlns="http://www.w3.org/1998/Math/MathML"', 1)
+
         # Step 7: Decode common Unicode entities to actual characters (Word prefers this)
         unicode_map = {
             # Basic operators
-            '&#x0002B;': '+',
-            '&#x0002D;': '-',
-            '&#x0002A;': '*',
-            '&#x0002F;': '/',
-            '&#x0003D;': '=',
-            '&#x0003C;': '<',
-            '&#x0003E;': '>',
-            '&#x00028;': '(',
-            '&#x00029;': ')',
-            '&#x0002C;': ',',
-            '&#x0002E;': '.',
-            '&#x0007C;': '|',
-            '&#x00B0;': '°',
-            '&#x00D7;': '×',  # times
-            '&#x00F7;': '÷',  # div
-            '&#x00B1;': '±',  # pm
-            '&#x2213;': '∓',  # mp
-            
+            "&#x0002B;": "+",
+            "&#x0002D;": "-",
+            "&#x0002A;": "*",
+            "&#x0002F;": "/",
+            "&#x0003D;": "=",
+            "&#x0003C;": "<",
+            "&#x0003E;": ">",
+            "&#x00028;": "(",
+            "&#x00029;": ")",
+            "&#x0002C;": ",",
+            "&#x0002E;": ".",
+            "&#x0007C;": "|",
+            "&#x00B0;": "°",
+            "&#x00D7;": "×",  # times
+            "&#x00F7;": "÷",  # div
+            "&#x00B1;": "±",  # pm
+            "&#x2213;": "∓",  # mp
             # Ellipsis symbols
-            '&#x02026;': '…',  # ldots (horizontal)
-            '&#x022EE;': '⋮',  # vdots (vertical)
-            '&#x022EF;': '⋯',  # cdots (centered)
-            '&#x022F0;': '⋰',  # iddots (diagonal up)
-            '&#x022F1;': '⋱',  # ddots (diagonal down)
-            
+            "&#x02026;": "…",  # ldots (horizontal)
+            "&#x022EE;": "⋮",  # vdots (vertical)
+            "&#x022EF;": "⋯",  # cdots (centered)
+            "&#x022F0;": "⋰",  # iddots (diagonal up)
+            "&#x022F1;": "⋱",  # ddots (diagonal down)
             # Greek letters (lowercase)
-            '&#x03B1;': 'α',  # alpha
-            '&#x03B2;': 'β',  # beta
-            '&#x03B3;': 'γ',  # gamma
-            '&#x03B4;': 'δ',  # delta
-            '&#x03B5;': 'ε',  # epsilon
-            '&#x03B6;': 'ζ',  # zeta
-            '&#x03B7;': 'η',  # eta
-            '&#x03B8;': 'θ',  # theta
-            '&#x03B9;': 'ι',  # iota
-            '&#x03BA;': 'κ',  # kappa
-            '&#x03BB;': 'λ',  # lambda
-            '&#x03BC;': 'μ',  # mu
-            '&#x03BD;': 'ν',  # nu
-            '&#x03BE;': 'ξ',  # xi
-            '&#x03BF;': 'ο',  # omicron
-            '&#x03C0;': 'π',  # pi
-            '&#x03C1;': 'ρ',  # rho
-            '&#x03C2;': 'ς',  # final sigma
-            '&#x03C3;': 'σ',  # sigma
-            '&#x03C4;': 'τ',  # tau
-            '&#x03C5;': 'υ',  # upsilon
-            '&#x03C6;': 'φ',  # phi
-            '&#x03C7;': 'χ',  # chi
-            '&#x03C8;': 'ψ',  # psi
-            '&#x03C9;': 'ω',  # omega
-            '&#x03D5;': 'ϕ',  # phi variant
-            
+            "&#x03B1;": "α",  # alpha
+            "&#x03B2;": "β",  # beta
+            "&#x03B3;": "γ",  # gamma
+            "&#x03B4;": "δ",  # delta
+            "&#x03B5;": "ε",  # epsilon
+            "&#x03B6;": "ζ",  # zeta
+            "&#x03B7;": "η",  # eta
+            "&#x03B8;": "θ",  # theta
+            "&#x03B9;": "ι",  # iota
+            "&#x03BA;": "κ",  # kappa
+            "&#x03BB;": "λ",  # lambda
+            "&#x03BC;": "μ",  # mu
+            "&#x03BD;": "ν",  # nu
+            "&#x03BE;": "ξ",  # xi
+            "&#x03BF;": "ο",  # omicron
+            "&#x03C0;": "π",  # pi
+            "&#x03C1;": "ρ",  # rho
+            "&#x03C2;": "ς",  # final sigma
+            "&#x03C3;": "σ",  # sigma
+            "&#x03C4;": "τ",  # tau
+            "&#x03C5;": "υ",  # upsilon
+            "&#x03C6;": "φ",  # phi
+            "&#x03C7;": "χ",  # chi
+            "&#x03C8;": "ψ",  # psi
+            "&#x03C9;": "ω",  # omega
+            "&#x03D5;": "ϕ",  # phi variant
             # Greek letters (uppercase)
-            '&#x0391;': 'Α',  # Alpha
-            '&#x0392;': 'Β',  # Beta
-            '&#x0393;': 'Γ',  # Gamma
-            '&#x0394;': 'Δ',  # Delta
-            '&#x0395;': 'Ε',  # Epsilon
-            '&#x0396;': 'Ζ',  # Zeta
-            '&#x0397;': 'Η',  # Eta
-            '&#x0398;': 'Θ',  # Theta
-            '&#x0399;': 'Ι',  # Iota
-            '&#x039A;': 'Κ',  # Kappa
-            '&#x039B;': 'Λ',  # Lambda
-            '&#x039C;': 'Μ',  # Mu
-            '&#x039D;': 'Ν',  # Nu
-            '&#x039E;': 'Ξ',  # Xi
-            '&#x039F;': 'Ο',  # Omicron
-            '&#x03A0;': 'Π',  # Pi
-            '&#x03A1;': 'Ρ',  # Rho
-            '&#x03A3;': 'Σ',  # Sigma
-            '&#x03A4;': 'Τ',  # Tau
-            '&#x03A5;': 'Υ',  # Upsilon
-            '&#x03A6;': 'Φ',  # Phi
-            '&#x03A7;': 'Χ',  # Chi
-            '&#x03A8;': 'Ψ',  # Psi
-            '&#x03A9;': 'Ω',  # Omega
-            
+            "&#x0391;": "Α",  # Alpha
+            "&#x0392;": "Β",  # Beta
+            "&#x0393;": "Γ",  # Gamma
+            "&#x0394;": "Δ",  # Delta
+            "&#x0395;": "Ε",  # Epsilon
+            "&#x0396;": "Ζ",  # Zeta
+            "&#x0397;": "Η",  # Eta
+            "&#x0398;": "Θ",  # Theta
+            "&#x0399;": "Ι",  # Iota
+            "&#x039A;": "Κ",  # Kappa
+            "&#x039B;": "Λ",  # Lambda
+            "&#x039C;": "Μ",  # Mu
+            "&#x039D;": "Ν",  # Nu
+            "&#x039E;": "Ξ",  # Xi
+            "&#x039F;": "Ο",  # Omicron
+            "&#x03A0;": "Π",  # Pi
+            "&#x03A1;": "Ρ",  # Rho
+            "&#x03A3;": "Σ",  # Sigma
+            "&#x03A4;": "Τ",  # Tau
+            "&#x03A5;": "Υ",  # Upsilon
+            "&#x03A6;": "Φ",  # Phi
+            "&#x03A7;": "Χ",  # Chi
+            "&#x03A8;": "Ψ",  # Psi
+            "&#x03A9;": "Ω",  # Omega
             # Math symbols
-            '&#x2205;': '∅',  # emptyset
-            '&#x2208;': '∈',  # in
-            '&#x2209;': '∉',  # notin
-            '&#x220B;': '∋',  # ni
-            '&#x220C;': '∌',  # nni
-            '&#x2211;': '∑',  # sum
-            '&#x220F;': '∏',  # prod
-            '&#x221A;': '√',  # sqrt
-            '&#x221B;': '∛',  # cbrt
-            '&#x221C;': '∜',  # fourthroot
-            '&#x221E;': '∞',  # infty
-            '&#x2229;': '∩',  # cap
-            '&#x222A;': '∪',  # cup
-            '&#x222B;': '∫',  # int
-            '&#x222C;': '∬',  # iint
-            '&#x222D;': '∭',  # iiint
-            '&#x222E;': '∮',  # oint
-            '&#x2282;': '⊂',  # subset
-            '&#x2283;': '⊃',  # supset
-            '&#x2284;': '⊄',  # nsubset
-            '&#x2285;': '⊅',  # nsupset
-            '&#x2286;': '⊆',  # subseteq
-            '&#x2287;': '⊇',  # supseteq
-            '&#x2288;': '⊈',  # nsubseteq
-            '&#x2289;': '⊉',  # nsupseteq
-            '&#x2264;': '≤',  # leq
-            '&#x2265;': '≥',  # geq
-            '&#x2260;': '≠',  # neq
-            '&#x2261;': '≡',  # equiv
-            '&#x2248;': '≈',  # approx
-            '&#x2243;': '≃',  # simeq
-            '&#x2245;': '≅',  # cong
-            '&#x2202;': '∂',  # partial
-            '&#x2207;': '∇',  # nabla
-            '&#x2200;': '∀',  # forall
-            '&#x2203;': '∃',  # exists
-            '&#x2204;': '∄',  # nexists
-            '&#x00AC;': '¬',  # neg/lnot
-            '&#x2227;': '∧',  # wedge/land
-            '&#x2228;': '∨',  # vee/lor
-            '&#x2192;': '→',  # to/rightarrow
-            '&#x2190;': '←',  # leftarrow
-            '&#x2194;': '↔',  # leftrightarrow
-            '&#x21D2;': '⇒',  # Rightarrow
-            '&#x21D0;': '⇐',  # Leftarrow
-            '&#x21D4;': '⇔',  # Leftrightarrow
-            '&#x2191;': '↑',  # uparrow
-            '&#x2193;': '↓',  # downarrow
-            '&#x21D1;': '⇑',  # Uparrow
-            '&#x21D3;': '⇓',  # Downarrow
-            '&#x2195;': '↕',  # updownarrow
-            '&#x21D5;': '⇕',  # Updownarrow
-            '&#x2260;': '≠',  # ne
-            '&#x226A;': '≪',  # ll
-            '&#x226B;': '≫',  # gg
-            '&#x2A7D;': '⩽',  # leqslant
-            '&#x2A7E;': '⩾',  # geqslant
-            '&#x22A5;': '⊥',  # perp
-            '&#x2225;': '∥',  # parallel
-            '&#x2220;': '∠',  # angle
-            '&#x25B3;': '△',  # triangle
-            '&#x25A1;': '□',  # square
-            '&#x25CA;': '◊',  # diamond
-            '&#x2660;': '♠',  # spadesuit
-            '&#x2661;': '♡',  # heartsuit
-            '&#x2662;': '♢',  # diamondsuit
-            '&#x2663;': '♣',  # clubsuit
-            '&#x2113;': 'ℓ',  # ell
-            '&#x2118;': '℘',  # wp (Weierstrass p)
-            '&#x211C;': 'ℜ',  # Re (real part)
-            '&#x2111;': 'ℑ',  # Im (imaginary part)
-            '&#x2135;': 'ℵ',  # aleph
-            '&#x2136;': 'ℶ',  # beth
+            "&#x2205;": "∅",  # emptyset
+            "&#x2208;": "∈",  # in
+            "&#x2209;": "∉",  # notin
+            "&#x220B;": "∋",  # ni
+            "&#x220C;": "∌",  # nni
+            "&#x2211;": "∑",  # sum
+            "&#x220F;": "∏",  # prod
+            "&#x221A;": "√",  # sqrt
+            "&#x221B;": "∛",  # cbrt
+            "&#x221C;": "∜",  # fourthroot
+            "&#x221E;": "∞",  # infty
+            "&#x2229;": "∩",  # cap
+            "&#x222A;": "∪",  # cup
+            "&#x222B;": "∫",  # int
+            "&#x222C;": "∬",  # iint
+            "&#x222D;": "∭",  # iiint
+            "&#x222E;": "∮",  # oint
+            "&#x2282;": "⊂",  # subset
+            "&#x2283;": "⊃",  # supset
+            "&#x2284;": "⊄",  # nsubset
+            "&#x2285;": "⊅",  # nsupset
+            "&#x2286;": "⊆",  # subseteq
+            "&#x2287;": "⊇",  # supseteq
+            "&#x2288;": "⊈",  # nsubseteq
+            "&#x2289;": "⊉",  # nsupseteq
+            "&#x2264;": "≤",  # leq
+            "&#x2265;": "≥",  # geq
+            "&#x2260;": "≠",  # neq
+            "&#x2261;": "≡",  # equiv
+            "&#x2248;": "≈",  # approx
+            "&#x2243;": "≃",  # simeq
+            "&#x2245;": "≅",  # cong
+            "&#x2202;": "∂",  # partial
+            "&#x2207;": "∇",  # nabla
+            "&#x2200;": "∀",  # forall
+            "&#x2203;": "∃",  # exists
+            "&#x2204;": "∄",  # nexists
+            "&#x00AC;": "¬",  # neg/lnot
+            "&#x2227;": "∧",  # wedge/land
+            "&#x2228;": "∨",  # vee/lor
+            "&#x2192;": "→",  # to/rightarrow
+            "&#x2190;": "←",  # leftarrow
+            "&#x2194;": "↔",  # leftrightarrow
+            "&#x21D2;": "⇒",  # Rightarrow
+            "&#x21D0;": "⇐",  # Leftarrow
+            "&#x21D4;": "⇔",  # Leftrightarrow
+            "&#x2191;": "↑",  # uparrow
+            "&#x2193;": "↓",  # downarrow
+            "&#x21D1;": "⇑",  # Uparrow
+            "&#x21D3;": "⇓",  # Downarrow
+            "&#x2195;": "↕",  # updownarrow
+            "&#x21D5;": "⇕",  # Updownarrow
+            "&#x2260;": "≠",  # ne
+            "&#x226A;": "≪",  # ll
+            "&#x226B;": "≫",  # gg
+            "&#x2A7D;": "⩽",  # leqslant
+            "&#x2A7E;": "⩾",  # geqslant
+            "&#x22A5;": "⊥",  # perp
+            "&#x2225;": "∥",  # parallel
+            "&#x2220;": "∠",  # angle
+            "&#x25B3;": "△",  # triangle
+            "&#x25A1;": "□",  # square
+            "&#x25CA;": "◊",  # diamond
+            "&#x2660;": "♠",  # spadesuit
+            "&#x2661;": "♡",  # heartsuit
+            "&#x2662;": "♢",  # diamondsuit
+            "&#x2663;": "♣",  # clubsuit
+            "&#x2113;": "ℓ",  # ell
+            "&#x2118;": "℘",  # wp (Weierstrass p)
+            "&#x211C;": "ℜ",  # Re (real part)
+            "&#x2111;": "ℑ",  # Im (imaginary part)
+            "&#x2135;": "ℵ",  # aleph
+            "&#x2136;": "ℶ",  # beth
         }
-        
+
         for entity, char in unicode_map.items():
             mathml = mathml.replace(entity, char)
-        
+
         # Also handle decimal entity format (&#NNNN;) for common characters
         # Convert decimal to hex-based lookup
         decimal_patterns = [
-            (r'&#955;', 'λ'),    # lambda (decimal 955 = hex 03BB)
-            (r'&#8942;', '⋮'),   # vdots (decimal 8942 = hex 22EE)
-            (r'&#8943;', '⋯'),   # cdots (decimal 8943 = hex 22EF)
-            (r'&#8230;', '…'),   # ldots (decimal 8230 = hex 2026)
-            (r'&#8734;', '∞'),   # infty (decimal 8734 = hex 221E)
-            (r'&#8721;', '∑'),   # sum (decimal 8721 = hex 2211)
-            (r'&#8719;', '∏'),   # prod (decimal 8719 = hex 220F)
-            (r'&#8730;', '√'),   # sqrt (decimal 8730 = hex 221A)
-            (r'&#8712;', '∈'),   # in (decimal 8712 = hex 2208)
-            (r'&#8713;', '∉'),   # notin (decimal 8713 = hex 2209)
-            (r'&#8745;', '∩'),   # cap (decimal 8745 = hex 2229)
-            (r'&#8746;', '∪'),   # cup (decimal 8746 = hex 222A)
-            (r'&#8804;', '≤'),   # leq (decimal 8804 = hex 2264)
-            (r'&#8805;', '≥'),   # geq (decimal 8805 = hex 2265)
-            (r'&#8800;', '≠'),   # neq (decimal 8800 = hex 2260)
-            (r'&#8776;', '≈'),   # approx (decimal 8776 = hex 2248)
-            (r'&#8801;', '≡'),   # equiv (decimal 8801 = hex 2261)
+            (r"&#955;", "λ"),  # lambda (decimal 955 = hex 03BB)
+            (r"&#8942;", "⋮"),  # vdots (decimal 8942 = hex 22EE)
+            (r"&#8943;", "⋯"),  # cdots (decimal 8943 = hex 22EF)
+            (r"&#8230;", "…"),  # ldots (decimal 8230 = hex 2026)
+            (r"&#8734;", "∞"),  # infty (decimal 8734 = hex 221E)
+            (r"&#8721;", "∑"),  # sum (decimal 8721 = hex 2211)
+            (r"&#8719;", "∏"),  # prod (decimal 8719 = hex 220F)
+            (r"&#8730;", "√"),  # sqrt (decimal 8730 = hex 221A)
+            (r"&#8712;", "∈"),  # in (decimal 8712 = hex 2208)
+            (r"&#8713;", "∉"),  # notin (decimal 8713 = hex 2209)
+            (r"&#8745;", "∩"),  # cap (decimal 8745 = hex 2229)
+            (r"&#8746;", "∪"),  # cup (decimal 8746 = hex 222A)
+            (r"&#8804;", "≤"),  # leq (decimal 8804 = hex 2264)
+            (r"&#8805;", "≥"),  # geq (decimal 8805 = hex 2265)
+            (r"&#8800;", "≠"),  # neq (decimal 8800 = hex 2260)
+            (r"&#8776;", "≈"),  # approx (decimal 8776 = hex 2248)
+            (r"&#8801;", "≡"),  # equiv (decimal 8801 = hex 2261)
         ]
-        
+
         for pattern, char in decimal_patterns:
             mathml = mathml.replace(pattern, char)
-        
+
         # Step 8: Clean up extra whitespace
-        mathml = re.sub(r'>\s+<', '><', mathml)
-        
+        mathml = re.sub(r">\s+<", "><", mathml)
+
         return mathml
 
-    def _latex_to_mathml(self, latex_formula: str) -> str:
+    def _latex_to_mathml(self, latex_formula: str, is_display: bool = False) -> str:
         """Convert LaTeX formula to standard MathML.
 
         Args:
             latex_formula: Pure LaTeX formula (without delimiters).
+            is_display: True if display (block) formula, False if inline.
 
         Returns:
             Standard MathML representation.
         """
-        return self._latex_to_mathml_cached(latex_formula)
+        return self._latex_to_mathml_cached(latex_formula, is_display=is_display)
 
     def _mathml_to_mml(self, mathml: str) -> str:
         """Convert standard MathML to mml:math format with namespace prefix.
diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py
index 2981052..18f5b85 100644
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -41,12 +41,23 @@ _COMMANDS_NEED_SPACE = {
     "log",
     "ln",
     "exp",
+    # set relations (often glued by OCR)
+    "in",
+    "notin",
+    "subset",
+    "supset",
+    "subseteq",
+    "supseteq",
+    "cap",
+    "cup",
     # misc
     "partial",
     "nabla",
 }
 
 _MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
+# Match LaTeX commands: \command (greedy match all letters)
+# The splitting logic in _split_glued_command_token will handle \inX -> \in X
 _COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
 
 # stage2: differentials inside math segments
@@ -65,6 +76,7 @@ def _split_glued_command_token(token: str) -> str:
     Examples:
     - \\cdotdS -> \\cdot dS
     - \\intdx  -> \\int dx
+    - \\inX    -> \\in X (stop at uppercase letter)
     """
     if not token.startswith("\\"):
         return token
@@ -74,8 +86,8 @@ def _split_glued_command_token(token: str) -> str:
         return token
 
     best = None
-    # longest prefix that is in whitelist
-    for i in range(1, len(body)):
+    # Find longest prefix that is in whitelist
+    for i in range(1, len(body) + 1):
         prefix = body[:i]
         if prefix in _COMMANDS_NEED_SPACE:
             best = prefix
@@ -117,12 +129,22 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
     # Pattern 2: Spaces inside braces that follow _ or ^
     # _{i 1} -> _{i1}, ^{2 3} -> ^{23}
     # This is safe because spaces inside subscript/superscript braces are usually OCR errors
+    # BUT: if content contains LaTeX commands (\in, \alpha, etc.), spaces after them
+    # must be preserved as they serve as command terminators (\in X != \inX)
     def clean_subscript_superscript_braces(match):
         operator = match.group(1)  # _ or ^
         content = match.group(2)  # content inside braces
-        # Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
-        # Only remove spaces between non-backslash characters
-        cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content)
+        if "\\" not in content:
+            # No backslash, hence no LaTeX command: safe to remove all whitespace
+            cleaned = re.sub(r"\s+", "", content)
+        else:
+            # Contains a backslash (LaTeX command): remove whitespace selectively
+            # Pass 1 drops whitespace directly before a backslash (x \in -> x\in)
+            # Pass 2 drops whitespace NOT preceded by a backslash or an ASCII letter
+            # So a space after any letter survives: this keeps \in X intact, but it
+            # also leaves e.g. "i j" untouched when the braces contain a command
+            cleaned = re.sub(r"\s+(?=\\)", "", content)       # pass 1: space before \
+            cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned)  # pass 2: see above
         return f"{operator}{{{cleaned}}}"
 
     # Match _{ ... } or ^{ ... }
@@ -156,7 +178,7 @@ def _postprocess_math(expr: str) -> str:
 
     Processing stages:
     0. Fix OCR number errors (spaces in numbers)
-    1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
+    1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS, \\inX -> \\in X)
     2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
     3. Normalize differentials (DISABLED by default to avoid breaking variables)
 
@@ -169,7 +191,7 @@ def _postprocess_math(expr: str) -> str:
     # stage0: fix OCR number errors (digits with spaces)
     expr = _fix_ocr_number_errors(expr)
 
-    # stage1: split glued command tokens (e.g. \cdotdS)
+    # stage1: split glued command tokens (e.g. \cdotdS, \inX)
     expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
 
     # stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)