feat: add paddleocr-vl

This commit is contained in:
liuyuanchuang
2026-02-05 20:32:26 +08:00
parent 767006ee38
commit 4de9aefa68
4 changed files with 351 additions and 308 deletions

View File

@@ -5,6 +5,7 @@ import numpy as np
import cv2 import cv2
import requests import requests
from io import BytesIO from io import BytesIO
import base64
from app.core.config import get_settings from app.core.config import get_settings
from paddleocr import PaddleOCRVL from paddleocr import PaddleOCRVL
from typing import Optional from typing import Optional
@@ -12,6 +13,7 @@ from app.services.layout_detector import LayoutDetector
from app.services.image_processor import ImageProcessor from app.services.image_processor import ImageProcessor
from app.services.converter import Converter from app.services.converter import Converter
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from openai import OpenAI
settings = get_settings() settings = get_settings()
@@ -90,42 +92,42 @@ def _split_glued_command_token(token: str) -> str:
def _clean_latex_syntax_spaces(expr: str) -> str: def _clean_latex_syntax_spaces(expr: str) -> str:
"""Clean unwanted spaces in LaTeX syntax (common OCR errors). """Clean unwanted spaces in LaTeX syntax (common OCR errors).
OCR often adds spaces in LaTeX syntax structures where they shouldn't be: OCR often adds spaces in LaTeX syntax structures where they shouldn't be:
- Subscripts: a _ {i 1} -> a_{i1} - Subscripts: a _ {i 1} -> a_{i1}
- Superscripts: x ^ {2 3} -> x^{23} - Superscripts: x ^ {2 3} -> x^{23}
- Fractions: \\frac { a } { b } -> \\frac{a}{b} - Fractions: \\frac { a } { b } -> \\frac{a}{b}
- Commands: \\ alpha -> \\alpha - Commands: \\ alpha -> \\alpha
- Braces: { a b } -> {ab} (within subscripts/superscripts) - Braces: { a b } -> {ab} (within subscripts/superscripts)
This is safe because these spaces are always OCR errors - LaTeX doesn't This is safe because these spaces are always OCR errors - LaTeX doesn't
need or want spaces in these positions. need or want spaces in these positions.
Args: Args:
expr: LaTeX math expression. expr: LaTeX math expression.
Returns: Returns:
Expression with LaTeX syntax spaces cleaned. Expression with LaTeX syntax spaces cleaned.
""" """
# Pattern 1: Spaces around _ and ^ (subscript/superscript operators) # Pattern 1: Spaces around _ and ^ (subscript/superscript operators)
# a _ {i} -> a_{i}, x ^ {2} -> x^{2} # a _ {i} -> a_{i}, x ^ {2} -> x^{2}
expr = re.sub(r'\s*_\s*', '_', expr) expr = re.sub(r"\s*_\s*", "_", expr)
expr = re.sub(r'\s*\^\s*', '^', expr) expr = re.sub(r"\s*\^\s*", "^", expr)
# Pattern 2: Spaces inside braces that follow _ or ^ # Pattern 2: Spaces inside braces that follow _ or ^
# _{i 1} -> _{i1}, ^{2 3} -> ^{23} # _{i 1} -> _{i1}, ^{2 3} -> ^{23}
# This is safe because spaces inside subscript/superscript braces are usually OCR errors # This is safe because spaces inside subscript/superscript braces are usually OCR errors
def clean_subscript_superscript_braces(match): def clean_subscript_superscript_braces(match):
operator = match.group(1) # _ or ^ operator = match.group(1) # _ or ^
content = match.group(2) # content inside braces content = match.group(2) # content inside braces
# Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta) # Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
# Only remove spaces between non-backslash characters # Only remove spaces between non-backslash characters
cleaned = re.sub(r'(?<!\\)\s+(?!\\)', '', content) cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content)
return f"{operator}{{{cleaned}}}" return f"{operator}{{{cleaned}}}"
# Match _{ ... } or ^{ ... } # Match _{ ... } or ^{ ... }
expr = re.sub(r'([_^])\{([^}]+)\}', clean_subscript_superscript_braces, expr) expr = re.sub(r"([_^])\{([^}]+)\}", clean_subscript_superscript_braces, expr)
# Pattern 3: Spaces inside \frac arguments # Pattern 3: Spaces inside \frac arguments
# \frac { a } { b } -> \frac{a}{b} # \frac { a } { b } -> \frac{a}{b}
# \frac{ a + b }{ c } -> \frac{a+b}{c} # \frac{ a + b }{ c } -> \frac{a+b}{c}
@@ -133,47 +135,46 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
numerator = match.group(1).strip() numerator = match.group(1).strip()
denominator = match.group(2).strip() denominator = match.group(2).strip()
return f"\\frac{{{numerator}}}{{{denominator}}}" return f"\\frac{{{numerator}}}{{{denominator}}}"
expr = re.sub(r'\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}', expr = re.sub(r"\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}", clean_frac_braces, expr)
clean_frac_braces, expr)
# Pattern 4: Spaces after backslash in LaTeX commands # Pattern 4: Spaces after backslash in LaTeX commands
# \ alpha -> \alpha, \ beta -> \beta # \ alpha -> \alpha, \ beta -> \beta
expr = re.sub(r'\\\s+([a-zA-Z]+)', r'\\\1', expr) expr = re.sub(r"\\\s+([a-zA-Z]+)", r"\\\1", expr)
# Pattern 5: Spaces before/after braces in general contexts (conservative) # Pattern 5: Spaces before/after braces in general contexts (conservative)
# Only remove if the space is clearly wrong (e.g., after operators) # Only remove if the space is clearly wrong (e.g., after operators)
# { x } in standalone context is kept as-is to avoid breaking valid spacing # { x } in standalone context is kept as-is to avoid breaking valid spacing
# But after operators like \sqrt{ x } -> \sqrt{x} # But after operators like \sqrt{ x } -> \sqrt{x}
expr = re.sub(r'(\\[a-zA-Z]+)\s*\{\s*', r'\1{', expr) # \sqrt { -> \sqrt{ expr = re.sub(r"(\\[a-zA-Z]+)\s*\{\s*", r"\1{", expr) # \sqrt { -> \sqrt{
return expr return expr
def _postprocess_math(expr: str) -> str: def _postprocess_math(expr: str) -> str:
"""Postprocess a *math* expression (already inside $...$ or $$...$$). """Postprocess a *math* expression (already inside $...$ or $$...$$).
Processing stages: Processing stages:
0. Fix OCR number errors (spaces in numbers) 0. Fix OCR number errors (spaces in numbers)
1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS) 1. Split glued LaTeX commands (e.g., \\cdotdS -> \\cdot dS)
2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1}) 2. Clean LaTeX syntax spaces (e.g., a _ {i 1} -> a_{i1})
3. Normalize differentials (DISABLED by default to avoid breaking variables) 3. Normalize differentials (DISABLED by default to avoid breaking variables)
Args: Args:
expr: LaTeX math expression without delimiters. expr: LaTeX math expression without delimiters.
Returns: Returns:
Processed LaTeX expression. Processed LaTeX expression.
""" """
# stage0: fix OCR number errors (digits with spaces) # stage0: fix OCR number errors (digits with spaces)
expr = _fix_ocr_number_errors(expr) expr = _fix_ocr_number_errors(expr)
# stage1: split glued command tokens (e.g. \cdotdS) # stage1: split glued command tokens (e.g. \cdotdS)
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr) expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
# stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces) # stage2: clean LaTeX syntax spaces (OCR often adds unwanted spaces)
expr = _clean_latex_syntax_spaces(expr) expr = _clean_latex_syntax_spaces(expr)
# stage3: normalize differentials - DISABLED # stage3: normalize differentials - DISABLED
# This feature is disabled because it's too aggressive and can break: # This feature is disabled because it's too aggressive and can break:
# - LaTeX commands containing 'd': \vdots, \lambda (via subscripts), \delta, etc. # - LaTeX commands containing 'd': \vdots, \lambda (via subscripts), \delta, etc.
@@ -186,40 +187,36 @@ def _postprocess_math(expr: str) -> str:
# #
# If differential normalization is needed, implement a context-aware version: # If differential normalization is needed, implement a context-aware version:
# expr = _normalize_differentials_contextaware(expr) # expr = _normalize_differentials_contextaware(expr)
return expr return expr
def _normalize_differentials_contextaware(expr: str) -> str: def _normalize_differentials_contextaware(expr: str) -> str:
"""Context-aware differential normalization (optional, not used by default). """Context-aware differential normalization (optional, not used by default).
Only normalizes differentials in specific mathematical contexts: Only normalizes differentials in specific mathematical contexts:
1. After integral symbols: \\int dx, \\iint dA, \\oint dr 1. After integral symbols: \\int dx, \\iint dA, \\oint dr
2. In fraction denominators: \\frac{dy}{dx} 2. In fraction denominators: \\frac{dy}{dx}
3. In explicit differential notation: f(x)dx (function followed by differential) 3. In explicit differential notation: f(x)dx (function followed by differential)
This avoids false positives like variable names, subscripts, or LaTeX commands. This avoids false positives like variable names, subscripts, or LaTeX commands.
Args: Args:
expr: LaTeX math expression. expr: LaTeX math expression.
Returns: Returns:
Expression with differentials normalized in safe contexts only. Expression with differentials normalized in safe contexts only.
""" """
# Pattern 1: After integral commands # Pattern 1: After integral commands
# \int dx -> \int d x # \int dx -> \int d x
integral_pattern = re.compile( integral_pattern = re.compile(r"(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])")
r'(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])' expr = integral_pattern.sub(r"\1 \2 d \3", expr)
)
expr = integral_pattern.sub(r'\1 \2 d \3', expr)
# Pattern 2: In fraction denominators # Pattern 2: In fraction denominators
# \frac{...}{dx} -> \frac{...}{d x} # \frac{...}{dx} -> \frac{...}{d x}
frac_pattern = re.compile( frac_pattern = re.compile(r"(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})")
r'(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})' expr = frac_pattern.sub(r"\1d \2\3", expr)
)
expr = frac_pattern.sub(r'\1d \2\3', expr)
return expr return expr
@@ -241,21 +238,21 @@ def _fix_ocr_number_errors(expr: str) -> str:
""" """
# Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)" # Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)"
# Example: "2 2. 2" → "22.2" # Example: "2 2. 2" → "22.2"
expr = re.sub(r'(\d)\s+(\d+)\.\s*(\d+)', r'\1\2.\3', expr) expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr)
# Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)" # Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)"
# Example: "22. 2" → "22.2" # Example: "22. 2" → "22.2"
expr = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', expr) expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr)
# Fix pattern 3: "digit space digit" (no decimal point, within same number context) # Fix pattern 3: "digit space digit" (no decimal point, within same number context)
# Be careful: only merge if followed by decimal point or comma/end # Be careful: only merge if followed by decimal point or comma/end
# Example: "1 5 0" → "150" when followed by comma or end # Example: "1 5 0" → "150" when followed by comma or end
expr = re.sub(r'(\d)\s+(\d)(?=\s*[,\)]|$)', r'\1\2', expr) expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr)
# Fix pattern 4: Multiple spaces in decimal numbers # Fix pattern 4: Multiple spaces in decimal numbers
# Example: "2 2 . 2" → "22.2" # Example: "2 2 . 2" → "22.2"
expr = re.sub(r'(\d)\s+(\d)(?=\s*\.)', r'\1\2', expr) expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr)
return expr return expr
@@ -273,76 +270,76 @@ def _postprocess_markdown(markdown_content: str) -> str:
return seg return seg
markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content) markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
# Apply markdown-level postprocessing (after LaTeX processing) # Apply markdown-level postprocessing (after LaTeX processing)
markdown_content = _remove_false_heading_from_single_formula(markdown_content) markdown_content = _remove_false_heading_from_single_formula(markdown_content)
return markdown_content return markdown_content
def _remove_false_heading_from_single_formula(markdown_content: str) -> str: def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
"""Remove false heading markers from single-formula content. """Remove false heading markers from single-formula content.
OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix. OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix.
This function detects and removes the heading marker when: This function detects and removes the heading marker when:
1. The content contains only one formula (display or inline) 1. The content contains only one formula (display or inline)
2. The formula line starts with '#' (heading marker) 2. The formula line starts with '#' (heading marker)
3. No other non-formula text content exists 3. No other non-formula text content exists
Examples: Examples:
Input: "# $$E = mc^2$$" Input: "# $$E = mc^2$$"
Output: "$$E = mc^2$$" Output: "$$E = mc^2$$"
Input: "# $x = y$" Input: "# $x = y$"
Output: "$x = y$" Output: "$x = y$"
Input: "# Introduction\n$$E = mc^2$$" (has text, keep heading) Input: "# Introduction\n$$E = mc^2$$" (has text, keep heading)
Output: "# Introduction\n$$E = mc^2$$" Output: "# Introduction\n$$E = mc^2$$"
Args: Args:
markdown_content: Markdown text with potential false headings. markdown_content: Markdown text with potential false headings.
Returns: Returns:
Markdown text with false heading markers removed. Markdown text with false heading markers removed.
""" """
if not markdown_content or not markdown_content.strip(): if not markdown_content or not markdown_content.strip():
return markdown_content return markdown_content
lines = markdown_content.split('\n') lines = markdown_content.split("\n")
# Count formulas and heading lines # Count formulas and heading lines
formula_count = 0 formula_count = 0
heading_lines = [] heading_lines = []
has_non_formula_text = False has_non_formula_text = False
for i, line in enumerate(lines): for i, line in enumerate(lines):
line_stripped = line.strip() line_stripped = line.strip()
if not line_stripped: if not line_stripped:
continue continue
# Check if line starts with heading marker # Check if line starts with heading marker
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped) heading_match = re.match(r"^(#{1,6})\s+(.+)$", line_stripped)
if heading_match: if heading_match:
heading_level = heading_match.group(1) heading_level = heading_match.group(1)
content = heading_match.group(2) content = heading_match.group(2)
# Check if the heading content is a formula # Check if the heading content is a formula
if re.fullmatch(r'\$\$?.+\$\$?', content): if re.fullmatch(r"\$\$?.+\$\$?", content):
# This is a heading with a formula # This is a heading with a formula
heading_lines.append((i, heading_level, content)) heading_lines.append((i, heading_level, content))
formula_count += 1 formula_count += 1
else: else:
# This is a real heading with text # This is a real heading with text
has_non_formula_text = True has_non_formula_text = True
elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped): elif re.fullmatch(r"\$\$?.+\$\$?", line_stripped):
# Standalone formula line (not in a heading) # Standalone formula line (not in a heading)
formula_count += 1 formula_count += 1
elif line_stripped and not re.match(r'^#+\s*$', line_stripped): elif line_stripped and not re.match(r"^#+\s*$", line_stripped):
# Non-empty, non-heading, non-formula line # Non-empty, non-heading, non-formula line
has_non_formula_text = True has_non_formula_text = True
# Only remove heading markers if: # Only remove heading markers if:
# 1. There's exactly one formula # 1. There's exactly one formula
# 2. That formula is in a heading line # 2. That formula is in a heading line
@@ -351,8 +348,8 @@ def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
# Remove the heading marker from the formula # Remove the heading marker from the formula
line_idx, heading_level, formula_content = heading_lines[0] line_idx, heading_level, formula_content = heading_lines[0]
lines[line_idx] = formula_content lines[line_idx] = formula_content
return '\n'.join(lines) return "\n".join(lines)
class OCRServiceBase(ABC): class OCRServiceBase(ABC):
@@ -492,16 +489,87 @@ class MineruOCRService(OCRServiceBase):
api_url: str = "http://127.0.0.1:8000/file_parse", api_url: str = "http://127.0.0.1:8000/file_parse",
image_processor: Optional[ImageProcessor] = None, image_processor: Optional[ImageProcessor] = None,
converter: Optional[Converter] = None, converter: Optional[Converter] = None,
paddleocr_vl_url: str = "http://localhost:8000/v1",
): ):
"""Initialize Local API service. """Initialize Local API service.
Args: Args:
api_url: URL of the local file_parse API endpoint. api_url: URL of the local file_parse API endpoint.
converter: Optional converter instance for format conversion. converter: Optional converter instance for format conversion.
paddleocr_vl_url: URL of the PaddleOCR-VL vLLM server.
""" """
self.api_url = api_url self.api_url = api_url
self.image_processor = image_processor self.image_processor = image_processor
self.converter = converter self.converter = converter
self.paddleocr_vl_url = paddleocr_vl_url
self.openai_client = OpenAI(api_key="EMPTY", base_url=paddleocr_vl_url, timeout=3600)
def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str:
    """Recognize a formula by sending the image to the PaddleOCR-VL server.

    The image is PNG-encoded, embedded as a base64 ``data:`` URL and sent
    together with ``prompt`` through ``self.openai_client`` (an
    OpenAI-compatible chat-completions client).

    Args:
        image: Input image as numpy array in BGR format.
        prompt: Recognition prompt (default: "Formula Recognition:")

    Returns:
        Recognized formula text (LaTeX format); never ``None``.

    Raises:
        RuntimeError: If encoding fails, the request fails, or the server
            returns no text content. The underlying error is chained.
    """
    try:
        # Encode image to base64
        success, encoded_image = cv2.imencode(".png", image)
        if not success:
            raise RuntimeError("Failed to encode image")

        image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
        image_url = f"data:image/png;base64,{image_base64}"

        # Call OpenAI-compatible API
        messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}]

        response = self.openai_client.chat.completions.create(
            model="PaddlePaddle/PaddleOCR-VL",
            messages=messages,
            temperature=0.0,
        )

        content = response.choices[0].message.content
        # The message content is nullable in the chat-completions schema;
        # fail loudly here instead of handing None to callers that expect str.
        if content is None:
            raise RuntimeError("response contained no text content")
        return content

    except Exception as e:
        raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e
def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str:
"""Extract image references from markdown and recognize formulas.
Args:
markdown_content: Markdown content with potential image references.
original_image: Original input image.
Returns:
Markdown content with formulas recognized by PaddleOCR-VL.
"""
# Pattern to match image references: ![](images/xxx.png)
image_pattern = re.compile(r"!\[\]\(images/[^)]+\)")
if not image_pattern.search(markdown_content):
return markdown_content
try:
# For now, use the entire image for formula recognition
# TODO: Extract specific regions if image paths contain coordinates
formula_text = self._recognize_formula_with_paddleocr_vl(original_image)
# Replace image references with recognized formulas
# Wrap in display math delimiters if not already wrapped
if not formula_text.startswith("$$"):
formula_text = f"$${formula_text}$$"
markdown_content = image_pattern.sub(formula_text, markdown_content)
except Exception as e:
# If formula recognition fails, keep original content
print(f"Warning: Formula recognition failed: {e}")
return markdown_content
def recognize(self, image: np.ndarray) -> dict: def recognize(self, image: np.ndarray) -> dict:
"""Recognize content using local file_parse API. """Recognize content using local file_parse API.
@@ -554,6 +622,11 @@ class MineruOCRService(OCRServiceBase):
if "results" in result and "image" in result["results"]: if "results" in result and "image" in result["results"]:
markdown_content = result["results"]["image"].get("md_content", "") markdown_content = result["results"]["image"].get("md_content", "")
# Check if markdown contains formula image references
if "![](images/" in markdown_content:
# Use PaddleOCR-VL to recognize the formula
markdown_content = self._extract_and_recognize_formulas(markdown_content, image)
# Apply postprocessing to fix OCR errors # Apply postprocessing to fix OCR errors
markdown_content = _postprocess_markdown(markdown_content) markdown_content = _postprocess_markdown(markdown_content)

202
diagnose_latex_rendering.py Normal file
View File

@@ -0,0 +1,202 @@
"""Diagnostic tool for LaTeX rendering issues.
Usage:
python diagnose_latex_rendering.py "\\lambda + \\vdots"
python diagnose_latex_rendering.py "$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$"
"""
import sys
import re
from typing import Dict, Any
# Simulate the OCR postprocessing pipeline
_COMMANDS_NEED_SPACE = {
"cdot",
"times",
"div",
"pm",
"mp",
"int",
"iint",
"iiint",
"oint",
"sum",
"prod",
"lim",
"sin",
"cos",
"tan",
"cot",
"sec",
"csc",
"log",
"ln",
"exp",
"partial",
"nabla",
}
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
def _split_glued_command_token(token: str) -> str:
    """Split an OCR-glued command token at its longest whitelisted prefix.

    ``token`` is expected to be a backslash followed by letters. If a proper
    prefix of the letters is in ``_COMMANDS_NEED_SPACE``, the longest such
    prefix is separated from the rest by one space (``\\cdotdS`` ->
    ``\\cdot dS``). Tokens without a leading backslash, or without any
    whitelisted proper prefix, are returned unchanged.

    NOTE: matching is purely prefix-based, so a longer valid command whose
    name starts with a whitelisted one is split as well, e.g. ``\\cdots``
    becomes ``\\cdot s``.
    """
    if not token.startswith("\\"):
        return token

    name = token[1:]
    # Walk cut points from longest to shortest proper prefix; the first hit
    # is therefore the longest whitelisted prefix, and the remainder is
    # never empty.
    for cut in range(len(name) - 1, 0, -1):
        head = name[:cut]
        if head in _COMMANDS_NEED_SPACE:
            return f"\\{head} {name[cut:]}"
    return token
def _fix_ocr_number_errors(expr: str) -> str:
"""Fix common OCR errors in LaTeX math expressions."""
expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr)
expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr)
expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr)
expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr)
return expr
def _postprocess_math(expr: str) -> Dict[str, str]:
    """Postprocess a *math* expression and record every intermediate stage.

    Args:
        expr: LaTeX math expression without its ``$`` / ``$$`` delimiters.

    Returns:
        A dict (not a plain string) with the expression after each stage:
        ``original``, ``after_stage0_numbers``, ``after_stage1_commands``,
        ``after_stage2_differentials`` and ``final``.

    NOTE(review): stage 2 here always applies differential normalization,
    while the service's own ``_postprocess_math`` in this commit marks that
    stage as DISABLED — confirm the tool is meant to keep simulating it.
    """
    original = expr

    # Stage 0: fix OCR number errors
    stage0 = _fix_ocr_number_errors(original)

    # Stage 1: split glued command tokens
    stage1 = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), stage0)

    # Stage 2: normalize differentials (upper-case first, then lower-case)
    stage2 = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", stage1)
    stage2 = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", stage2)

    return {
        "original": original,
        "after_stage0_numbers": stage0,
        "after_stage1_commands": stage1,
        "after_stage2_differentials": stage2,
        "final": stage2,
    }
def _postprocess_markdown(markdown_content: str) -> Dict[str, Any]:
"""Apply LaTeX postprocessing to markdown segments."""
if not markdown_content:
return {"original": markdown_content, "final": markdown_content, "segments": []}
segments = []
def _fix_segment(m: re.Match) -> str:
seg = m.group(0)
inner = None
if seg.startswith("$$") and seg.endswith("$$"):
inner = seg[2:-2]
result = _postprocess_math(inner)
segments.append({"type": "display", "original": seg, "processing": result})
return f"$${result['final']}$$"
elif seg.startswith("$") and seg.endswith("$"):
inner = seg[1:-1]
result = _postprocess_math(inner)
segments.append({"type": "inline", "original": seg, "processing": result})
return f"${result['final']}$"
return seg
final = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
return {"original": markdown_content, "final": final, "segments": segments, "changed": markdown_content != final}
def diagnose(latex_input: str) -> None:
    """Run the postprocessing simulation on ``latex_input`` and print a report.

    The report has three parts: which of a few known commands occur in the
    input, the per-stage result for every math segment found, and the final
    text together with a note on whether postprocessing changed it.

    Args:
        latex_input: Text to inspect; math must be wrapped in ``$`` or ``$$``
            to be picked up as a segment.
    """
    print("=" * 80)
    print("LaTeX Rendering Diagnostic Tool")
    print("=" * 80)

    print(f"\nInput: {latex_input}")
    print("-" * 80)

    # Check if input contains problematic characters
    print("\n1. Character Detection:")
    command_notes = (
        ("\\lambda", " ✅ Found \\lambda"),
        ("\\vdots", " ✅ Found \\vdots"),
        ("\\cdots", " Found \\cdots (similar to \\vdots)"),
        ("\\ldots", " Found \\ldots (similar to \\vdots)"),
    )
    for command, note in command_notes:
        if command in latex_input:
            print(note)

    # Run postprocessing pipeline
    print("\n2. Postprocessing Pipeline:")
    result = _postprocess_markdown(latex_input)

    # (label, key of the text before the stage, key of the text after it)
    stages = (
        ("Stage 0 (numbers)", "original", "after_stage0_numbers"),
        ("Stage 1 (commands)", "after_stage0_numbers", "after_stage1_commands"),
        ("Stage 2 (differentials)", "after_stage1_commands", "after_stage2_differentials"),
    )

    if result["segments"]:
        for i, seg in enumerate(result["segments"], 1):
            print(f"\n Segment {i} ({seg['type']}):")
            print(f" Original: {seg['original']}")

            proc = seg["processing"]

            # Check each stage for changes
            for label, before_key, after_key in stages:
                if proc[before_key] != proc[after_key]:
                    print(f" ⚠️ {label}: {proc[after_key]}")
                else:
                    print(f" ✅ {label}: No change")

            print(f" Final: {proc['final']}")
    else:
        print(" No math segments found (not wrapped in $ or $$)")

    print("\n3. Final Output:")
    print(f" {result['final']}")

    # The result for empty input carries no "changed" entry; treat a missing
    # entry as "nothing changed" instead of failing with KeyError.
    if result.get("changed", False):
        print("\n ⚠️ WARNING: The input was modified during postprocessing!")
        print(" This could be the cause of rendering issues.")
    else:
        print("\n ✅ No changes made during postprocessing.")
        print(" If rendering fails, the issue is likely in:")
        print(" - Pandoc conversion (LaTeX → MathML)")
        print(" - Frontend rendering (MathJax/KaTeX)")

    print("\n" + "=" * 80)
if __name__ == "__main__":
    # Exactly one positional argument is used: the text to diagnose.
    # Without it, show the usage text and exit with status 1.
    if len(sys.argv) >= 2:
        latex_input = sys.argv[1]
        diagnose(latex_input)
    else:
        for usage_line in (
            'Usage: python diagnose_latex_rendering.py "<latex_formula>"',
            "\nExamples:",
            ' python diagnose_latex_rendering.py "$\\lambda + \\vdots$"',
            ' python diagnose_latex_rendering.py "$$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$$"',
        ):
            print(usage_line)
        sys.exit(1)

View File

@@ -27,11 +27,12 @@ dependencies = [
"paddlepaddle", "paddlepaddle",
"paddleocr[doc-parser]", "paddleocr[doc-parser]",
"safetensors", "safetensors",
"lxml>=5.0.0" "lxml>=5.0.0",
"openai",
] ]
[tool.uv.sources] # [tool.uv.sources]
paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" } # paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" }
[project.optional-dependencies] [project.optional-dependencies]
dev = [ dev = [

View File

@@ -1,233 +0,0 @@
"""Test for removing false heading markers from single-formula content.
OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix.
This test verifies that the heading marker is correctly removed.
"""
import re
def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
"""Remove false heading markers from single-formula content."""
if not markdown_content or not markdown_content.strip():
return markdown_content
lines = markdown_content.split('\n')
# Count formulas and heading lines
formula_count = 0
heading_lines = []
has_non_formula_text = False
for i, line in enumerate(lines):
line_stripped = line.strip()
if not line_stripped:
continue
# Check if line starts with heading marker
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped)
if heading_match:
heading_level = heading_match.group(1)
content = heading_match.group(2)
# Check if the heading content is a formula
if re.fullmatch(r'\$\$?.+\$\$?', content):
# This is a heading with a formula
heading_lines.append((i, heading_level, content))
formula_count += 1
else:
# This is a real heading with text
has_non_formula_text = True
elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped):
# Standalone formula line (not in a heading)
formula_count += 1
elif line_stripped and not re.match(r'^#+\s*$', line_stripped):
# Non-empty, non-heading, non-formula line
has_non_formula_text = True
# Only remove heading markers if:
# 1. There's exactly one formula
# 2. That formula is in a heading line
# 3. There's no other text content
if formula_count == 1 and len(heading_lines) == 1 and not has_non_formula_text:
# Remove the heading marker from the formula
line_idx, heading_level, formula_content = heading_lines[0]
lines[line_idx] = formula_content
return '\n'.join(lines)
# Test cases: (input markdown, expected output, description)
test_cases = [
    # Should remove heading marker (single formula with heading)
    ("# $$E = mc^2$$", "$$E = mc^2$$", "Single display formula with heading"),
    ("# $x = y$", "$x = y$", "Single inline formula with heading"),
    ("## $$\\frac{a}{b}$$", "$$\\frac{a}{b}$$", "Single formula with level-2 heading"),
    ("### $$\\lambda_{1}$$", "$$\\lambda_{1}$$", "Single formula with level-3 heading"),
    # Should NOT remove heading marker (has text content)
    ("# Introduction\n$$E = mc^2$$", "# Introduction\n$$E = mc^2$$", "Heading with text + formula (keep heading)"),
    ("# Title\nSome text\n$$E = mc^2$$", "# Title\nSome text\n$$E = mc^2$$", "Heading + text + formula (keep heading)"),
    ("$$E = mc^2$$\n# Summary", "$$E = mc^2$$\n# Summary", "Formula + heading with text (keep heading)"),
    # Should NOT remove heading marker (multiple formulas)
    ("# $$x = y$$\n$$a = b$$", "# $$x = y$$\n$$a = b$$", "Multiple formulas (keep heading)"),
    ("$$x = y$$\n# $$a = b$$", "$$x = y$$\n# $$a = b$$", "Two formulas, one with heading (keep heading)"),
    # Should NOT remove heading marker (standalone formula without heading)
    ("$$E = mc^2$$", "$$E = mc^2$$", "Single formula without heading (no change)"),
    ("$x = y$", "$x = y$", "Single inline formula without heading (no change)"),
    # Edge cases
    ("", "", "Empty string"),
    ("# ", "# ", "Empty heading"),
    ("#", "#", "Just hash symbol"),
    ("# $$E = mc^2$$\n\n", "$$E = mc^2$$\n\n", "Formula with heading and trailing newlines"),
    ("\n\n# $$E = mc^2$$", "\n\n$$E = mc^2$$", "Formula with heading and leading newlines"),
    # Complex formulas
    (
        "# $$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$",
        "$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$",
        "Complex integral formula with heading",
    ),
    (
        "# $$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$",
        "$$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$",
        "Matrix formula with heading",
    ),
]
# Run every case, print a per-case report, then a pass/fail summary.
print("=" * 80)
print("Remove False Heading from Single Formula - Test")
print("=" * 80)

passed = 0
failed = 0

for i, (input_text, expected, description) in enumerate(test_cases, 1):
    result = _remove_false_heading_from_single_formula(input_text)
    is_match = result == expected

    if is_match:
        status = "✅ PASS"
        passed += 1
    else:
        status = "❌ FAIL"
        failed += 1

    print(f"\n{status} Test {i}: {description}")
    print(f" Input: {input_text!r}")
    print(f" Expected: {expected!r}")
    print(f" Got: {result!r}")
    if not is_match:
        print(" >>> MISMATCH!")

print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
print(f"Total tests: {len(test_cases)}")
print(f"✅ Passed: {passed}")
print(f"❌ Failed: {failed}")

if failed:
    print(f"\n⚠️ {failed} test(s) failed")
else:
    print("\n✅ All tests passed!")
# Short behavior overview: re-run four representative inputs and show
# whether each one still produces the expected output.
print("\n" + "=" * 80)
print("KEY SCENARIOS")
print("=" * 80)

key_scenarios = [
    ("# $$E = mc^2$$", "$$E = mc^2$$", "✅ Remove heading"),
    ("# Introduction\n$$E = mc^2$$", "# Introduction\n$$E = mc^2$$", "❌ Keep heading (has text)"),
    ("# $$x = y$$\n$$a = b$$", "# $$x = y$$\n$$a = b$$", "❌ Keep heading (multiple formulas)"),
    ("$$E = mc^2$$", "$$E = mc^2$$", "→ No change (no heading)"),
]

print("\nBehavior Summary:")
for input_text, expected, explanation in key_scenarios:
    result = _remove_false_heading_from_single_formula(input_text)
    # Distinct markers: with identical strings in both branches a wrong
    # result could not be told apart from a correct one.
    match = "✓" if result == expected else "✗"
    print(f" {match} {explanation}")
    # Separate input and output so the two reprs do not run together.
    print(f" {input_text!r} → {result!r}")

print("\n" + "=" * 80)
print("DECISION LOGIC")
print("=" * 80)
print("""
Remove heading marker ONLY when ALL conditions are met:
1. ✅ Exactly ONE formula in the entire content
2. ✅ That formula is on a line starting with '#' (heading marker)
3. ✅ No other text content exists (only formula and empty lines)
Otherwise: Keep the heading marker as-is.
""")
print("=" * 80)