feat: add paddleocr-vl
This commit is contained in:
@@ -5,6 +5,7 @@ import numpy as np
|
|||||||
import cv2
|
import cv2
|
||||||
import requests
|
import requests
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
import base64
|
||||||
from app.core.config import get_settings
|
from app.core.config import get_settings
|
||||||
from paddleocr import PaddleOCRVL
|
from paddleocr import PaddleOCRVL
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -12,6 +13,7 @@ from app.services.layout_detector import LayoutDetector
|
|||||||
from app.services.image_processor import ImageProcessor
|
from app.services.image_processor import ImageProcessor
|
||||||
from app.services.converter import Converter
|
from app.services.converter import Converter
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
|
|
||||||
@@ -109,22 +111,22 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
|
|||||||
"""
|
"""
|
||||||
# Pattern 1: Spaces around _ and ^ (subscript/superscript operators)
|
# Pattern 1: Spaces around _ and ^ (subscript/superscript operators)
|
||||||
# a _ {i} -> a_{i}, x ^ {2} -> x^{2}
|
# a _ {i} -> a_{i}, x ^ {2} -> x^{2}
|
||||||
expr = re.sub(r'\s*_\s*', '_', expr)
|
expr = re.sub(r"\s*_\s*", "_", expr)
|
||||||
expr = re.sub(r'\s*\^\s*', '^', expr)
|
expr = re.sub(r"\s*\^\s*", "^", expr)
|
||||||
|
|
||||||
# Pattern 2: Spaces inside braces that follow _ or ^
|
# Pattern 2: Spaces inside braces that follow _ or ^
|
||||||
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
|
# _{i 1} -> _{i1}, ^{2 3} -> ^{23}
|
||||||
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
|
# This is safe because spaces inside subscript/superscript braces are usually OCR errors
|
||||||
def clean_subscript_superscript_braces(match):
|
def clean_subscript_superscript_braces(match):
|
||||||
operator = match.group(1) # _ or ^
|
operator = match.group(1) # _ or ^
|
||||||
content = match.group(2) # content inside braces
|
content = match.group(2) # content inside braces
|
||||||
# Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
|
# Remove spaces but preserve LaTeX commands (e.g., \alpha, \beta)
|
||||||
# Only remove spaces between non-backslash characters
|
# Only remove spaces between non-backslash characters
|
||||||
cleaned = re.sub(r'(?<!\\)\s+(?!\\)', '', content)
|
cleaned = re.sub(r"(?<!\\)\s+(?!\\)", "", content)
|
||||||
return f"{operator}{{{cleaned}}}"
|
return f"{operator}{{{cleaned}}}"
|
||||||
|
|
||||||
# Match _{ ... } or ^{ ... }
|
# Match _{ ... } or ^{ ... }
|
||||||
expr = re.sub(r'([_^])\{([^}]+)\}', clean_subscript_superscript_braces, expr)
|
expr = re.sub(r"([_^])\{([^}]+)\}", clean_subscript_superscript_braces, expr)
|
||||||
|
|
||||||
# Pattern 3: Spaces inside \frac arguments
|
# Pattern 3: Spaces inside \frac arguments
|
||||||
# \frac { a } { b } -> \frac{a}{b}
|
# \frac { a } { b } -> \frac{a}{b}
|
||||||
@@ -134,18 +136,17 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
|
|||||||
denominator = match.group(2).strip()
|
denominator = match.group(2).strip()
|
||||||
return f"\\frac{{{numerator}}}{{{denominator}}}"
|
return f"\\frac{{{numerator}}}{{{denominator}}}"
|
||||||
|
|
||||||
expr = re.sub(r'\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}',
|
expr = re.sub(r"\\frac\s*\{\s*([^}]+?)\s*\}\s*\{\s*([^}]+?)\s*\}", clean_frac_braces, expr)
|
||||||
clean_frac_braces, expr)
|
|
||||||
|
|
||||||
# Pattern 4: Spaces after backslash in LaTeX commands
|
# Pattern 4: Spaces after backslash in LaTeX commands
|
||||||
# \ alpha -> \alpha, \ beta -> \beta
|
# \ alpha -> \alpha, \ beta -> \beta
|
||||||
expr = re.sub(r'\\\s+([a-zA-Z]+)', r'\\\1', expr)
|
expr = re.sub(r"\\\s+([a-zA-Z]+)", r"\\\1", expr)
|
||||||
|
|
||||||
# Pattern 5: Spaces before/after braces in general contexts (conservative)
|
# Pattern 5: Spaces before/after braces in general contexts (conservative)
|
||||||
# Only remove if the space is clearly wrong (e.g., after operators)
|
# Only remove if the space is clearly wrong (e.g., after operators)
|
||||||
# { x } in standalone context is kept as-is to avoid breaking valid spacing
|
# { x } in standalone context is kept as-is to avoid breaking valid spacing
|
||||||
# But after operators like \sqrt{ x } -> \sqrt{x}
|
# But after operators like \sqrt{ x } -> \sqrt{x}
|
||||||
expr = re.sub(r'(\\[a-zA-Z]+)\s*\{\s*', r'\1{', expr) # \sqrt { -> \sqrt{
|
expr = re.sub(r"(\\[a-zA-Z]+)\s*\{\s*", r"\1{", expr) # \sqrt { -> \sqrt{
|
||||||
|
|
||||||
return expr
|
return expr
|
||||||
|
|
||||||
@@ -208,17 +209,13 @@ def _normalize_differentials_contextaware(expr: str) -> str:
|
|||||||
"""
|
"""
|
||||||
# Pattern 1: After integral commands
|
# Pattern 1: After integral commands
|
||||||
# \int dx -> \int d x
|
# \int dx -> \int d x
|
||||||
integral_pattern = re.compile(
|
integral_pattern = re.compile(r"(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])")
|
||||||
r'(\\i+nt|\\oint)\s*([^\\]*?)\s*d([a-zA-Z])(?![a-zA-Z])'
|
expr = integral_pattern.sub(r"\1 \2 d \3", expr)
|
||||||
)
|
|
||||||
expr = integral_pattern.sub(r'\1 \2 d \3', expr)
|
|
||||||
|
|
||||||
# Pattern 2: In fraction denominators
|
# Pattern 2: In fraction denominators
|
||||||
# \frac{...}{dx} -> \frac{...}{d x}
|
# \frac{...}{dx} -> \frac{...}{d x}
|
||||||
frac_pattern = re.compile(
|
frac_pattern = re.compile(r"(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})")
|
||||||
r'(\\frac\{[^}]*\}\{[^}]*?)d([a-zA-Z])(?![a-zA-Z])([^}]*\})'
|
expr = frac_pattern.sub(r"\1d \2\3", expr)
|
||||||
)
|
|
||||||
expr = frac_pattern.sub(r'\1d \2\3', expr)
|
|
||||||
|
|
||||||
return expr
|
return expr
|
||||||
|
|
||||||
@@ -241,20 +238,20 @@ def _fix_ocr_number_errors(expr: str) -> str:
|
|||||||
"""
|
"""
|
||||||
# Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)"
|
# Fix pattern 1: "digit space digit(s). digit(s)" → "digit digit(s).digit(s)"
|
||||||
# Example: "2 2. 2" → "22.2"
|
# Example: "2 2. 2" → "22.2"
|
||||||
expr = re.sub(r'(\d)\s+(\d+)\.\s*(\d+)', r'\1\2.\3', expr)
|
expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr)
|
||||||
|
|
||||||
# Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)"
|
# Fix pattern 2: "digit(s). space digit(s)" → "digit(s).digit(s)"
|
||||||
# Example: "22. 2" → "22.2"
|
# Example: "22. 2" → "22.2"
|
||||||
expr = re.sub(r'(\d+)\.\s+(\d+)', r'\1.\2', expr)
|
expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr)
|
||||||
|
|
||||||
# Fix pattern 3: "digit space digit" (no decimal point, within same number context)
|
# Fix pattern 3: "digit space digit" (no decimal point, within same number context)
|
||||||
# Be careful: only merge if followed by decimal point or comma/end
|
# Be careful: only merge if followed by decimal point or comma/end
|
||||||
# Example: "1 5 0" → "150" when followed by comma or end
|
# Example: "1 5 0" → "150" when followed by comma or end
|
||||||
expr = re.sub(r'(\d)\s+(\d)(?=\s*[,\)]|$)', r'\1\2', expr)
|
expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr)
|
||||||
|
|
||||||
# Fix pattern 4: Multiple spaces in decimal numbers
|
# Fix pattern 4: Multiple spaces in decimal numbers
|
||||||
# Example: "2 2 . 2" → "22.2"
|
# Example: "2 2 . 2" → "22.2"
|
||||||
expr = re.sub(r'(\d)\s+(\d)(?=\s*\.)', r'\1\2', expr)
|
expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr)
|
||||||
|
|
||||||
return expr
|
return expr
|
||||||
|
|
||||||
@@ -308,7 +305,7 @@ def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
|
|||||||
if not markdown_content or not markdown_content.strip():
|
if not markdown_content or not markdown_content.strip():
|
||||||
return markdown_content
|
return markdown_content
|
||||||
|
|
||||||
lines = markdown_content.split('\n')
|
lines = markdown_content.split("\n")
|
||||||
|
|
||||||
# Count formulas and heading lines
|
# Count formulas and heading lines
|
||||||
formula_count = 0
|
formula_count = 0
|
||||||
@@ -322,24 +319,24 @@ def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Check if line starts with heading marker
|
# Check if line starts with heading marker
|
||||||
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped)
|
heading_match = re.match(r"^(#{1,6})\s+(.+)$", line_stripped)
|
||||||
|
|
||||||
if heading_match:
|
if heading_match:
|
||||||
heading_level = heading_match.group(1)
|
heading_level = heading_match.group(1)
|
||||||
content = heading_match.group(2)
|
content = heading_match.group(2)
|
||||||
|
|
||||||
# Check if the heading content is a formula
|
# Check if the heading content is a formula
|
||||||
if re.fullmatch(r'\$\$?.+\$\$?', content):
|
if re.fullmatch(r"\$\$?.+\$\$?", content):
|
||||||
# This is a heading with a formula
|
# This is a heading with a formula
|
||||||
heading_lines.append((i, heading_level, content))
|
heading_lines.append((i, heading_level, content))
|
||||||
formula_count += 1
|
formula_count += 1
|
||||||
else:
|
else:
|
||||||
# This is a real heading with text
|
# This is a real heading with text
|
||||||
has_non_formula_text = True
|
has_non_formula_text = True
|
||||||
elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped):
|
elif re.fullmatch(r"\$\$?.+\$\$?", line_stripped):
|
||||||
# Standalone formula line (not in a heading)
|
# Standalone formula line (not in a heading)
|
||||||
formula_count += 1
|
formula_count += 1
|
||||||
elif line_stripped and not re.match(r'^#+\s*$', line_stripped):
|
elif line_stripped and not re.match(r"^#+\s*$", line_stripped):
|
||||||
# Non-empty, non-heading, non-formula line
|
# Non-empty, non-heading, non-formula line
|
||||||
has_non_formula_text = True
|
has_non_formula_text = True
|
||||||
|
|
||||||
@@ -352,7 +349,7 @@ def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
|
|||||||
line_idx, heading_level, formula_content = heading_lines[0]
|
line_idx, heading_level, formula_content = heading_lines[0]
|
||||||
lines[line_idx] = formula_content
|
lines[line_idx] = formula_content
|
||||||
|
|
||||||
return '\n'.join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
class OCRServiceBase(ABC):
|
class OCRServiceBase(ABC):
|
||||||
@@ -492,16 +489,87 @@ class MineruOCRService(OCRServiceBase):
|
|||||||
api_url: str = "http://127.0.0.1:8000/file_parse",
|
api_url: str = "http://127.0.0.1:8000/file_parse",
|
||||||
image_processor: Optional[ImageProcessor] = None,
|
image_processor: Optional[ImageProcessor] = None,
|
||||||
converter: Optional[Converter] = None,
|
converter: Optional[Converter] = None,
|
||||||
|
paddleocr_vl_url: str = "http://localhost:8000/v1",
|
||||||
):
|
):
|
||||||
"""Initialize Local API service.
|
"""Initialize Local API service.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
api_url: URL of the local file_parse API endpoint.
|
api_url: URL of the local file_parse API endpoint.
|
||||||
converter: Optional converter instance for format conversion.
|
converter: Optional converter instance for format conversion.
|
||||||
|
paddleocr_vl_url: URL of the PaddleOCR-VL vLLM server.
|
||||||
"""
|
"""
|
||||||
self.api_url = api_url
|
self.api_url = api_url
|
||||||
self.image_processor = image_processor
|
self.image_processor = image_processor
|
||||||
self.converter = converter
|
self.converter = converter
|
||||||
|
self.paddleocr_vl_url = paddleocr_vl_url
|
||||||
|
self.openai_client = OpenAI(api_key="EMPTY", base_url=paddleocr_vl_url, timeout=3600)
|
||||||
|
|
||||||
|
def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str:
    """Recognize formula using PaddleOCR-VL API.

    The image is PNG-encoded, embedded in a base64 ``data:`` URL and sent,
    together with ``prompt``, as a single user message through
    ``self.openai_client.chat.completions.create``.

    Args:
        image: Input image as numpy array in BGR format.
        prompt: Recognition prompt (default: "Formula Recognition:")

    Returns:
        Recognized formula text (LaTeX format). Never ``None`` or empty.

    Raises:
        RuntimeError: If the image cannot be encoded, the API call fails, or
            the first choice of the response carries no text content. The
            original exception is chained as ``__cause__``.
    """
    try:
        # Encode image to base64
        success, encoded_image = cv2.imencode(".png", image)
        if not success:
            raise RuntimeError("Failed to encode image")

        image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
        image_url = f"data:image/png;base64,{image_base64}"

        # Call OpenAI-compatible API
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_url}},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        response = self.openai_client.chat.completions.create(
            model="PaddlePaddle/PaddleOCR-VL",
            messages=messages,
            temperature=0.0,
        )

        content = response.choices[0].message.content
        # The content field may be missing/empty; returning it would break the
        # declared ``str`` contract and make callers emit an empty formula.
        if not content:
            raise RuntimeError("empty response content")
        return content

    except Exception as e:
        raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e
|
||||||
|
|
||||||
|
def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str:
|
||||||
|
"""Extract image references from markdown and recognize formulas.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
markdown_content: Markdown content with potential image references.
|
||||||
|
original_image: Original input image.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Markdown content with formulas recognized by PaddleOCR-VL.
|
||||||
|
"""
|
||||||
|
# Pattern to match image references: 
|
||||||
|
image_pattern = re.compile(r"!\[\]\(images/[^)]+\)")
|
||||||
|
|
||||||
|
if not image_pattern.search(markdown_content):
|
||||||
|
return markdown_content
|
||||||
|
|
||||||
|
try:
|
||||||
|
# For now, use the entire image for formula recognition
|
||||||
|
# TODO: Extract specific regions if image paths contain coordinates
|
||||||
|
formula_text = self._recognize_formula_with_paddleocr_vl(original_image)
|
||||||
|
|
||||||
|
# Replace image references with recognized formulas
|
||||||
|
# Wrap in display math delimiters if not already wrapped
|
||||||
|
if not formula_text.startswith("$$"):
|
||||||
|
formula_text = f"$${formula_text}$$"
|
||||||
|
|
||||||
|
markdown_content = image_pattern.sub(formula_text, markdown_content)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# If formula recognition fails, keep original content
|
||||||
|
print(f"Warning: Formula recognition failed: {e}")
|
||||||
|
|
||||||
|
return markdown_content
|
||||||
|
|
||||||
def recognize(self, image: np.ndarray) -> dict:
|
def recognize(self, image: np.ndarray) -> dict:
|
||||||
"""Recognize content using local file_parse API.
|
"""Recognize content using local file_parse API.
|
||||||
@@ -554,6 +622,11 @@ class MineruOCRService(OCRServiceBase):
|
|||||||
if "results" in result and "image" in result["results"]:
|
if "results" in result and "image" in result["results"]:
|
||||||
markdown_content = result["results"]["image"].get("md_content", "")
|
markdown_content = result["results"]["image"].get("md_content", "")
|
||||||
|
|
||||||
|
# Check if markdown contains formula image references
|
||||||
|
if "
|
||||||
|
|
||||||
# Apply postprocessing to fix OCR errors
|
# Apply postprocessing to fix OCR errors
|
||||||
markdown_content = _postprocess_markdown(markdown_content)
|
markdown_content = _postprocess_markdown(markdown_content)
|
||||||
|
|
||||||
|
|||||||
202
diagnose_latex_rendering.py
Normal file
202
diagnose_latex_rendering.py
Normal file
@@ -0,0 +1,202 @@
|
|||||||
|
"""Diagnostic tool for LaTeX rendering issues.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python diagnose_latex_rendering.py "\\lambda + \\vdots"
|
||||||
|
python diagnose_latex_rendering.py "$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$"
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
# Simulate the OCR postprocessing pipeline
|
||||||
|
_COMMANDS_NEED_SPACE = {
|
||||||
|
"cdot",
|
||||||
|
"times",
|
||||||
|
"div",
|
||||||
|
"pm",
|
||||||
|
"mp",
|
||||||
|
"int",
|
||||||
|
"iint",
|
||||||
|
"iiint",
|
||||||
|
"oint",
|
||||||
|
"sum",
|
||||||
|
"prod",
|
||||||
|
"lim",
|
||||||
|
"sin",
|
||||||
|
"cos",
|
||||||
|
"tan",
|
||||||
|
"cot",
|
||||||
|
"sec",
|
||||||
|
"csc",
|
||||||
|
"log",
|
||||||
|
"ln",
|
||||||
|
"exp",
|
||||||
|
"partial",
|
||||||
|
"nabla",
|
||||||
|
}
|
||||||
|
|
||||||
|
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
|
||||||
|
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")
|
||||||
|
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")
|
||||||
|
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_glued_command_token(token: str) -> str:
    """Split OCR-glued LaTeX command token by whitelist longest-prefix.

    Args:
        token: A candidate command token such as ``\\intdx``.

    Returns:
        ``\\<command> <rest>`` when the text after the backslash starts with a
        name from ``_COMMANDS_NEED_SPACE`` and has further characters after it
        (the longest such name wins); otherwise ``token`` unchanged. Tokens
        that do not start with a backslash, or that are exactly a whitelisted
        command, are returned as-is.

    NOTE: the whitelist is the only check, so a longer command whose name
    merely begins with a whitelisted one is split too (``\\cdots`` becomes
    ``\\cdot s``).
    """
    if not token.startswith("\\"):
        return token

    body = token[1:]

    # Only proper prefixes are tried (the slice never reaches len(body)), so
    # the remainder after a hit is never empty and short bodies simply yield
    # no candidate. Later hits are longer, so the last one found is kept.
    best = None
    for end in range(1, len(body)):
        prefix = body[:end]
        if prefix in _COMMANDS_NEED_SPACE:
            best = prefix

    if best is None:
        return token

    return f"\\{best} {body[len(best):]}"
|
||||||
|
|
||||||
|
|
||||||
|
def _fix_ocr_number_errors(expr: str) -> str:
|
||||||
|
"""Fix common OCR errors in LaTeX math expressions."""
|
||||||
|
expr = re.sub(r"(\d)\s+(\d+)\.\s*(\d+)", r"\1\2.\3", expr)
|
||||||
|
expr = re.sub(r"(\d+)\.\s+(\d+)", r"\1.\2", expr)
|
||||||
|
expr = re.sub(r"(\d)\s+(\d)(?=\s*[,\)]|$)", r"\1\2", expr)
|
||||||
|
expr = re.sub(r"(\d)\s+(\d)(?=\s*\.)", r"\1\2", expr)
|
||||||
|
return expr
|
||||||
|
|
||||||
|
|
||||||
|
def _postprocess_math(expr: str) -> Dict[str, str]:
    """Postprocess a *math* expression and record the text after every stage.

    ``expr`` is the body of a math segment (already inside $...$ or $$...$$,
    without the delimiters).

    Stages, applied in order:
        0. ``_fix_ocr_number_errors`` on the whole expression.
        1. Every match of ``_COMMAND_TOKEN_PATTERN`` is passed through
           ``_split_glued_command_token``.
        2. ``d`` + uppercase letter becomes ``\\mathrm{d} X``; then
           ``d`` + lowercase letter becomes ``d x``.

    NOTE: the stage-2 patterns only exclude a ``d`` that directly follows a
    backslash, so a ``d`` followed by a letter inside a command name also
    matches (e.g. the ``da`` in ``\\lambda``).

    Returns:
        A dict with the keys ``original``, ``after_stage0_numbers``,
        ``after_stage1_commands``, ``after_stage2_differentials`` and
        ``final`` (same value as the last stage).
    """
    original = expr

    # Stage 0: fix OCR number errors
    expr = _fix_ocr_number_errors(expr)
    stage0 = expr

    # Stage 1: split glued command tokens
    expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
    stage1 = expr

    # Stage 2: normalize differentials
    expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr)
    expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr)
    stage2 = expr

    return {
        "original": original,
        "after_stage0_numbers": stage0,
        "after_stage1_commands": stage1,
        "after_stage2_differentials": stage2,
        "final": expr,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _postprocess_markdown(markdown_content: str) -> Dict[str, Any]:
|
||||||
|
"""Apply LaTeX postprocessing to markdown segments."""
|
||||||
|
if not markdown_content:
|
||||||
|
return {"original": markdown_content, "final": markdown_content, "segments": []}
|
||||||
|
|
||||||
|
segments = []
|
||||||
|
|
||||||
|
def _fix_segment(m: re.Match) -> str:
|
||||||
|
seg = m.group(0)
|
||||||
|
inner = None
|
||||||
|
|
||||||
|
if seg.startswith("$$") and seg.endswith("$$"):
|
||||||
|
inner = seg[2:-2]
|
||||||
|
result = _postprocess_math(inner)
|
||||||
|
segments.append({"type": "display", "original": seg, "processing": result})
|
||||||
|
return f"$${result['final']}$$"
|
||||||
|
elif seg.startswith("$") and seg.endswith("$"):
|
||||||
|
inner = seg[1:-1]
|
||||||
|
result = _postprocess_math(inner)
|
||||||
|
segments.append({"type": "inline", "original": seg, "processing": result})
|
||||||
|
return f"${result['final']}$"
|
||||||
|
|
||||||
|
return seg
|
||||||
|
|
||||||
|
final = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
|
||||||
|
|
||||||
|
return {"original": markdown_content, "final": final, "segments": segments, "changed": markdown_content != final}
|
||||||
|
|
||||||
|
|
||||||
|
def diagnose(latex_input: str) -> None:
    """Run diagnostic on LaTeX input and print a report to stdout.

    The report has three parts:
        1. Which of a few known commands occur in the input.
        2. The per-segment, per-stage result of ``_postprocess_markdown``.
        3. The final text and whether postprocessing changed the input.

    Args:
        latex_input: Text to diagnose; math must be wrapped in ``$`` or ``$$``
            to be picked up as a segment.
    """
    banner = "=" * 80
    print(banner)
    print("LaTeX Rendering Diagnostic Tool")
    print(banner)
    print(f"\nInput: {latex_input}")
    print("-" * 80)

    # Check if input contains problematic characters
    print("\n1. Character Detection:")
    detections = (
        ("\\lambda", " ✅ Found \\lambda"),
        ("\\vdots", " ✅ Found \\vdots"),
        ("\\cdots", " ℹ️ Found \\cdots (similar to \\vdots)"),
        ("\\ldots", " ℹ️ Found \\ldots (similar to \\vdots)"),
    )
    for needle, message in detections:
        if needle in latex_input:
            print(message)

    # Run postprocessing pipeline
    print("\n2. Postprocessing Pipeline:")
    result = _postprocess_markdown(latex_input)

    # (label, key holding the text before the stage, key holding it after)
    stages = (
        ("Stage 0 (numbers)", "original", "after_stage0_numbers"),
        ("Stage 1 (commands)", "after_stage0_numbers", "after_stage1_commands"),
        ("Stage 2 (differentials)", "after_stage1_commands", "after_stage2_differentials"),
    )

    if result["segments"]:
        for i, seg in enumerate(result["segments"], 1):
            print(f"\n Segment {i} ({seg['type']}):")
            print(f" Original: {seg['original']}")

            proc = seg["processing"]

            # Check each stage for changes
            for label, before_key, after_key in stages:
                if proc[before_key] != proc[after_key]:
                    print(f" ⚠️ {label}: {proc[after_key]}")
                else:
                    print(f" ✅ {label}: No change")

            print(f" Final: {proc['final']}")
    else:
        print(" ℹ️ No math segments found (not wrapped in $ or $$)")

    print("\n3. Final Output:")
    print(f" {result['final']}")

    # .get(): the report for empty input may not carry a "changed" entry;
    # treat that as "nothing changed" instead of failing with KeyError.
    if result.get("changed", False):
        print("\n ⚠️ WARNING: The input was modified during postprocessing!")
        print(" This could be the cause of rendering issues.")
    else:
        print("\n ✅ No changes made during postprocessing.")
        print(" If rendering fails, the issue is likely in:")
        print(" - Pandoc conversion (LaTeX → MathML)")
        print(" - Frontend rendering (MathJax/KaTeX)")

    print("\n" + banner)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if not cli_args:
        # No formula supplied: show usage and exit with a non-zero status.
        usage_lines = (
            'Usage: python diagnose_latex_rendering.py "<latex_formula>"',
            "\nExamples:",
            ' python diagnose_latex_rendering.py "$\\lambda + \\vdots$"',
            ' python diagnose_latex_rendering.py "$$\\lambda_1, \\lambda_2, \\vdots, \\lambda_n$$"',
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    # Only the first argument is diagnosed; any further arguments are ignored.
    latex_input = cli_args[0]
    diagnose(latex_input)
|
||||||
@@ -27,11 +27,12 @@ dependencies = [
|
|||||||
"paddlepaddle",
|
"paddlepaddle",
|
||||||
"paddleocr[doc-parser]",
|
"paddleocr[doc-parser]",
|
||||||
"safetensors",
|
"safetensors",
|
||||||
"lxml>=5.0.0"
|
"lxml>=5.0.0",
|
||||||
|
"openai",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.uv.sources]
|
# [tool.uv.sources]
|
||||||
paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" }
|
# paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" }
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
dev = [
|
dev = [
|
||||||
|
|||||||
@@ -1,233 +0,0 @@
|
|||||||
"""Test for removing false heading markers from single-formula content.
|
|
||||||
|
|
||||||
OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix.
|
|
||||||
This test verifies that the heading marker is correctly removed.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
|
|
||||||
"""Remove false heading markers from single-formula content."""
|
|
||||||
if not markdown_content or not markdown_content.strip():
|
|
||||||
return markdown_content
|
|
||||||
|
|
||||||
lines = markdown_content.split('\n')
|
|
||||||
|
|
||||||
# Count formulas and heading lines
|
|
||||||
formula_count = 0
|
|
||||||
heading_lines = []
|
|
||||||
has_non_formula_text = False
|
|
||||||
|
|
||||||
for i, line in enumerate(lines):
|
|
||||||
line_stripped = line.strip()
|
|
||||||
|
|
||||||
if not line_stripped:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check if line starts with heading marker
|
|
||||||
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped)
|
|
||||||
|
|
||||||
if heading_match:
|
|
||||||
heading_level = heading_match.group(1)
|
|
||||||
content = heading_match.group(2)
|
|
||||||
|
|
||||||
# Check if the heading content is a formula
|
|
||||||
if re.fullmatch(r'\$\$?.+\$\$?', content):
|
|
||||||
# This is a heading with a formula
|
|
||||||
heading_lines.append((i, heading_level, content))
|
|
||||||
formula_count += 1
|
|
||||||
else:
|
|
||||||
# This is a real heading with text
|
|
||||||
has_non_formula_text = True
|
|
||||||
elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped):
|
|
||||||
# Standalone formula line (not in a heading)
|
|
||||||
formula_count += 1
|
|
||||||
elif line_stripped and not re.match(r'^#+\s*$', line_stripped):
|
|
||||||
# Non-empty, non-heading, non-formula line
|
|
||||||
has_non_formula_text = True
|
|
||||||
|
|
||||||
# Only remove heading markers if:
|
|
||||||
# 1. There's exactly one formula
|
|
||||||
# 2. That formula is in a heading line
|
|
||||||
# 3. There's no other text content
|
|
||||||
if formula_count == 1 and len(heading_lines) == 1 and not has_non_formula_text:
|
|
||||||
# Remove the heading marker from the formula
|
|
||||||
line_idx, heading_level, formula_content = heading_lines[0]
|
|
||||||
lines[line_idx] = formula_content
|
|
||||||
|
|
||||||
return '\n'.join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
# Test cases
|
|
||||||
test_cases = [
|
|
||||||
# Should remove heading marker (single formula with heading)
|
|
||||||
(
|
|
||||||
"# $$E = mc^2$$",
|
|
||||||
"$$E = mc^2$$",
|
|
||||||
"Single display formula with heading"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"# $x = y$",
|
|
||||||
"$x = y$",
|
|
||||||
"Single inline formula with heading"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"## $$\\frac{a}{b}$$",
|
|
||||||
"$$\\frac{a}{b}$$",
|
|
||||||
"Single formula with level-2 heading"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"### $$\\lambda_{1}$$",
|
|
||||||
"$$\\lambda_{1}$$",
|
|
||||||
"Single formula with level-3 heading"
|
|
||||||
),
|
|
||||||
|
|
||||||
# Should NOT remove heading marker (has text content)
|
|
||||||
(
|
|
||||||
"# Introduction\n$$E = mc^2$$",
|
|
||||||
"# Introduction\n$$E = mc^2$$",
|
|
||||||
"Heading with text + formula (keep heading)"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"# Title\nSome text\n$$E = mc^2$$",
|
|
||||||
"# Title\nSome text\n$$E = mc^2$$",
|
|
||||||
"Heading + text + formula (keep heading)"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"$$E = mc^2$$\n# Summary",
|
|
||||||
"$$E = mc^2$$\n# Summary",
|
|
||||||
"Formula + heading with text (keep heading)"
|
|
||||||
),
|
|
||||||
|
|
||||||
# Should NOT remove heading marker (multiple formulas)
|
|
||||||
(
|
|
||||||
"# $$x = y$$\n$$a = b$$",
|
|
||||||
"# $$x = y$$\n$$a = b$$",
|
|
||||||
"Multiple formulas (keep heading)"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"$$x = y$$\n# $$a = b$$",
|
|
||||||
"$$x = y$$\n# $$a = b$$",
|
|
||||||
"Two formulas, one with heading (keep heading)"
|
|
||||||
),
|
|
||||||
|
|
||||||
# Should NOT remove heading marker (standalone formula without heading)
|
|
||||||
(
|
|
||||||
"$$E = mc^2$$",
|
|
||||||
"$$E = mc^2$$",
|
|
||||||
"Single formula without heading (no change)"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"$x = y$",
|
|
||||||
"$x = y$",
|
|
||||||
"Single inline formula without heading (no change)"
|
|
||||||
),
|
|
||||||
|
|
||||||
# Edge cases
|
|
||||||
(
|
|
||||||
"",
|
|
||||||
"",
|
|
||||||
"Empty string"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"# ",
|
|
||||||
"# ",
|
|
||||||
"Empty heading"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"#",
|
|
||||||
"#",
|
|
||||||
"Just hash symbol"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"# $$E = mc^2$$\n\n",
|
|
||||||
"$$E = mc^2$$\n\n",
|
|
||||||
"Formula with heading and trailing newlines"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"\n\n# $$E = mc^2$$",
|
|
||||||
"\n\n$$E = mc^2$$",
|
|
||||||
"Formula with heading and leading newlines"
|
|
||||||
),
|
|
||||||
|
|
||||||
# Complex formulas
|
|
||||||
(
|
|
||||||
"# $$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$",
|
|
||||||
"$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$",
|
|
||||||
"Complex integral formula with heading"
|
|
||||||
),
|
|
||||||
(
|
|
||||||
"# $$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$",
|
|
||||||
"$$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$",
|
|
||||||
"Matrix formula with heading"
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
print("=" * 80)
|
|
||||||
print("Remove False Heading from Single Formula - Test")
|
|
||||||
print("=" * 80)
|
|
||||||
|
|
||||||
passed = 0
|
|
||||||
failed = 0
|
|
||||||
|
|
||||||
for i, (input_text, expected, description) in enumerate(test_cases, 1):
|
|
||||||
result = _remove_false_heading_from_single_formula(input_text)
|
|
||||||
|
|
||||||
if result == expected:
|
|
||||||
status = "✅ PASS"
|
|
||||||
passed += 1
|
|
||||||
else:
|
|
||||||
status = "❌ FAIL"
|
|
||||||
failed += 1
|
|
||||||
|
|
||||||
print(f"\n{status} Test {i}: {description}")
|
|
||||||
print(f" Input: {repr(input_text)}")
|
|
||||||
print(f" Expected: {repr(expected)}")
|
|
||||||
print(f" Got: {repr(result)}")
|
|
||||||
if result != expected:
|
|
||||||
print(f" >>> MISMATCH!")
|
|
||||||
|
|
||||||
print("\n" + "=" * 80)
|
|
||||||
print("SUMMARY")
|
|
||||||
print("=" * 80)
|
|
||||||
print(f"Total tests: {len(test_cases)}")
|
|
||||||
print(f"✅ Passed: {passed}")
|
|
||||||
print(f"❌ Failed: {failed}")
|
|
||||||
|
|
||||||
if failed == 0:
|
|
||||||
print("\n✅ All tests passed!")
|
|
||||||
else:
|
|
||||||
print(f"\n⚠️ {failed} test(s) failed")
|
|
||||||
|
|
||||||
print("\n" + "=" * 80)
|
|
||||||
print("KEY SCENARIOS")
|
|
||||||
print("=" * 80)
|
|
||||||
|
|
||||||
key_scenarios = [
|
|
||||||
("# $$E = mc^2$$", "$$E = mc^2$$", "✅ Remove heading"),
|
|
||||||
("# Introduction\n$$E = mc^2$$", "# Introduction\n$$E = mc^2$$", "❌ Keep heading (has text)"),
|
|
||||||
("# $$x = y$$\n$$a = b$$", "# $$x = y$$\n$$a = b$$", "❌ Keep heading (multiple formulas)"),
|
|
||||||
("$$E = mc^2$$", "$$E = mc^2$$", "→ No change (no heading)"),
|
|
||||||
]
|
|
||||||
|
|
||||||
print("\nBehavior Summary:")
|
|
||||||
for input_text, expected, explanation in key_scenarios:
|
|
||||||
result = _remove_false_heading_from_single_formula(input_text)
|
|
||||||
match = "✓" if result == expected else "✗"
|
|
||||||
print(f" {match} {explanation}")
|
|
||||||
print(f" {repr(input_text)} → {repr(result)}")
|
|
||||||
|
|
||||||
print("\n" + "=" * 80)
|
|
||||||
print("DECISION LOGIC")
|
|
||||||
print("=" * 80)
|
|
||||||
print("""
|
|
||||||
Remove heading marker ONLY when ALL conditions are met:
|
|
||||||
1. ✅ Exactly ONE formula in the entire content
|
|
||||||
2. ✅ That formula is on a line starting with '#' (heading marker)
|
|
||||||
3. ✅ No other text content exists (only formula and empty lines)
|
|
||||||
|
|
||||||
Otherwise: Keep the heading marker as-is.
|
|
||||||
""")
|
|
||||||
|
|
||||||
print("=" * 80)
|
|
||||||
Reference in New Issue
Block a user