Compare commits

...

8 Commits

Author SHA1 Message Date
liuyuanchuang
e31017cfe7 fix: add preprocess 2026-02-04 12:45:34 +08:00
liuyuanchuang
69f9a70ae5 feat: add omml api 2026-02-04 12:35:14 +08:00
liuyuanchuang
27f25d9f4d feat: update port config 2026-02-04 12:06:17 +08:00
liuyuanchuang
526c1f3a0d feat: optimize the format convert 2026-02-04 12:00:06 +08:00
10dbd59161 fix: matrix not rendor in docx 2026-01-14 14:18:00 +08:00
df2b664af4 fix: add image padding for mineru 2026-01-05 21:37:51 +08:00
6ea37c9380 feat: add mineru model 2026-01-05 17:30:54 +08:00
3870c108b2 fix: image alpha error 2026-01-01 23:38:52 +08:00
16 changed files with 1330 additions and 135 deletions

View File

@@ -1,10 +1,10 @@
"""Markdown to DOCX conversion endpoint."""
"""Format conversion endpoints."""
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import Response
from app.core.dependencies import get_converter
from app.schemas.convert import MarkdownToDocxRequest
from app.schemas.convert import MarkdownToDocxRequest, LatexToOmmlRequest, LatexToOmmlResponse
from app.services.converter import Converter
router = APIRouter()
@@ -28,3 +28,39 @@ async def convert_markdown_to_docx(
)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)
async def convert_latex_to_omml(
    request: LatexToOmmlRequest,
    converter: Converter = Depends(get_converter),
) -> LatexToOmmlResponse:
    """Convert LaTeX formula to OMML (Office Math Markup Language).

    OMML is the math format used by Microsoft Word and other Office applications.
    This endpoint is separate from the main OCR endpoint due to the performance
    overhead of OMML conversion (requires creating a temporary DOCX file).

    Args:
        request: Contains the LaTeX formula to convert (without $ or $$ delimiters).

    Returns:
        OMML representation of the formula.

    Example:
        ```bash
        curl -X POST "http://localhost:8000/api/v1/convert/latex-to-omml" \\
            -H "Content-Type: application/json" \\
            -d '{"latex": "\\\\frac{a}{b} + \\\\sqrt{c}"}'
        ```
    """
    # Defensive re-check: rejects blank input with a 400 before any conversion work.
    if not request.latex or not request.latex.strip():
        raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")

    # Keep the try body to the single call that can raise, and chain the
    # original exception so the root cause stays visible in tracebacks/logs.
    try:
        omml = converter.convert_to_omml(request.latex)
    except ValueError as e:
        # Invalid input reported by the converter -> client error.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        # Conversion backend failed -> service unavailable.
        raise HTTPException(status_code=503, detail=str(e)) from e

    return LatexToOmmlResponse(omml=omml)

View File

@@ -2,11 +2,11 @@
from fastapi import APIRouter, Depends, HTTPException
from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service
from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service
from app.schemas.image import ImageOCRRequest, ImageOCRResponse
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector
from app.services.ocr_service import OCRService
from app.services.ocr_service import OCRService, MineruOCRService
router = APIRouter()
@@ -16,7 +16,8 @@ async def process_image_ocr(
request: ImageOCRRequest,
image_processor: ImageProcessor = Depends(get_image_processor),
layout_detector: LayoutDetector = Depends(get_layout_detector),
ocr_service: OCRService = Depends(get_ocr_service),
mineru_service: MineruOCRService = Depends(get_mineru_ocr_service),
paddle_service: OCRService = Depends(get_ocr_service),
) -> ImageOCRResponse:
"""Process an image and extract content as LaTeX, Markdown, and MathML.
@@ -27,6 +28,9 @@ async def process_image_ocr(
- If plain text exists: use PP-DocLayoutV2 for mixed recognition
- Otherwise: use PaddleOCR-VL with formula prompt
4. Convert output to LaTeX, Markdown, and MathML formats
Note: OMML conversion is not included due to performance overhead.
Use the /convert/latex-to-omml endpoint to convert LaTeX to OMML separately.
"""
image = image_processor.preprocess(
@@ -35,14 +39,18 @@ async def process_image_ocr(
)
try:
# 3. Perform OCR based on layout
ocr_result = ocr_service.recognize(image)
if request.model_name == "mineru":
ocr_result = mineru_service.recognize(image)
elif request.model_name == "paddle":
ocr_result = paddle_service.recognize(image)
else:
raise HTTPException(status_code=400, detail="Invalid model name")
except RuntimeError as e:
raise HTTPException(status_code=503, detail=str(e))
# 4. Return response
return ImageOCRResponse(
latex=ocr_result.get("latex", ""),
markdown=ocr_result.get("markdown", ""),
mathml=ocr_result.get("mathml", ""),
mml=ocr_result.get("mml", ""),
)

View File

@@ -24,6 +24,9 @@ class Settings(BaseSettings):
# PaddleOCR-VL Settings
paddleocr_vl_url: str = "http://127.0.0.1:8000/v1"
# MinerU OCR Settings
miner_ocr_api_url: str = "http://127.0.0.1:8000/file_parse"
# Model Paths
pp_doclayout_model_dir: Optional[str] = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2"

View File

@@ -2,7 +2,7 @@
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector
from app.services.ocr_service import OCRService
from app.services.ocr_service import OCRService, MineruOCRService
from app.services.converter import Converter
from app.core.config import get_settings
@@ -45,3 +45,14 @@ def get_converter() -> Converter:
"""Get a DOCX converter instance."""
return Converter()
def get_mineru_ocr_service() -> MineruOCRService:
    """Build a MineruOCRService wired with its collaborators.

    The API endpoint is read from the ``miner_ocr_api_url`` setting; when the
    settings object has no such attribute, the local default URL is used.
    """
    return MineruOCRService(
        api_url=getattr(
            get_settings(),
            "miner_ocr_api_url",
            "http://127.0.0.1:8000/file_parse",
        ),
        converter=get_converter(),
        image_processor=get_image_processor(),
    )

View File

@@ -37,9 +37,9 @@ app.include_router(api_router, prefix=settings.api_prefix)
async def health_check():
"""Health check endpoint."""
return {"status": "healthy"}
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8053)
uvicorn.run(app, host="0.0.0.0", port=settings.port)

View File

@@ -1,4 +1,4 @@
"""Request and response schemas for markdown to DOCX conversion endpoint."""
"""Request and response schemas for format conversion endpoints."""
from pydantic import BaseModel, Field, field_validator
@@ -17,3 +17,23 @@ class MarkdownToDocxRequest(BaseModel):
raise ValueError("Markdown content cannot be empty")
return v
class LatexToOmmlRequest(BaseModel):
    """Request body for LaTeX to OMML conversion endpoint."""

    latex: str = Field(..., description="Pure LaTeX formula (without $ or $$ delimiters)")

    @field_validator("latex")
    @classmethod
    def validate_latex_not_empty(cls, v: str) -> str:
        """Accept the formula only if it has non-whitespace content."""
        if v and v.strip():
            return v
        raise ValueError("LaTeX formula cannot be empty")
class LatexToOmmlResponse(BaseModel):
    """Response body for LaTeX to OMML conversion endpoint."""

    # Defaults to an empty string when no value is supplied.
    omml: str = Field("", description="OMML (Office Math Markup Language) representation")

View File

@@ -25,6 +25,7 @@ class ImageOCRRequest(BaseModel):
image_url: str | None = Field(None, description="URL to fetch the image from")
image_base64: str | None = Field(None, description="Base64-encoded image data")
model_name: str = Field("mineru", description="Name of the model to use for OCR")
@model_validator(mode="after")
def validate_input(self):
@@ -39,11 +40,10 @@ class ImageOCRRequest(BaseModel):
class ImageOCRResponse(BaseModel):
"""Response body for image OCR endpoint."""
latex: str = Field("", description="LaTeX representation of the content")
latex: str = Field("", description="LaTeX representation of the content (empty if mixed content)")
markdown: str = Field("", description="Markdown representation of the content")
mathml: str = Field("", description="MathML representation (empty if no math detected)")
mathml: str = Field("", description="Standard MathML representation (empty if mixed content)")
mml: str = Field("", description="XML MathML with mml: namespace prefix (empty if mixed content)")
layout_info: LayoutInfo = Field(default_factory=LayoutInfo)
recognition_mode: str = Field(
"", description="Recognition mode used: mixed_recognition or formula_recognition"
)
recognition_mode: str = Field("", description="Recognition mode used: mixed_recognition or formula_recognition")

View File

@@ -4,17 +4,29 @@ import os
import re
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from typing import Literal
import pypandoc
from latex2mathml.converter import convert as latex_to_mathml
@dataclass
class ConvertResult:
"""Result of markdown conversion."""
"""Result of markdown conversion.
Only populated when input contains pure LaTeX formula.
All fields are empty strings when input contains mixed content (text + formula).
Attributes:
latex: Pure LaTeX formula code (without delimiters).
mathml: Standard MathML format.
mml: XML MathML with mml: namespace prefix (mml:math).
"""
latex: str
mathml: str
mml: str
@dataclass
@@ -28,59 +40,430 @@ class ExportResult:
ExportType = Literal["docx", "pdf"]
# MathML namespace
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
# XSLT for MathML to mml: namespace conversion
MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:mml="http://www.w3.org/1998/Math/MathML"
xmlns:m="http://www.w3.org/1998/Math/MathML"
exclude-result-prefixes="m">
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- Match root math element -->
<xsl:template match="m:math|math">
<mml:math>
<xsl:apply-templates select="@*|node()"/>
</mml:math>
</xsl:template>
<!-- Match all other MathML elements -->
<xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
<xsl:element name="mml:{local-name()}">
<xsl:apply-templates select="@*|node()"/>
</xsl:element>
</xsl:template>
<!-- Copy attributes -->
<xsl:template match="@*">
<xsl:if test="local-name() != 'xmlns'">
<xsl:copy/>
</xsl:if>
</xsl:template>
<!-- Copy text nodes -->
<xsl:template match="text()">
<xsl:value-of select="."/>
</xsl:template>
</xsl:stylesheet>
"""
class Converter:
"""Service for conversion and export operations."""
"""Service for conversion and export operations.
Conversion rules:
- Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
- Mixed content (text + formula) returns empty results for all formats.
- OMML conversion is provided as a separate method due to performance overhead.
Performance optimizations:
- Pre-compiled regex patterns
- XSLT-based MML conversion
- Cached XSLT transforms
- Direct Pandoc OMML output (avoids DOCX parsing)
"""
# Pandoc input format with LaTeX math extensions
INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
# Pre-compiled regex patterns for formula detection
_RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
_RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
_RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
_RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
_RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")
# Pre-compiled regex patterns for preprocessing
_RE_VSPACE = re.compile(r"\\\[1mm\]")
_RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
_RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
_RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
_RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
_RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
_RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
_RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
_RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
_RE_ALIGNED_BRACE = re.compile(r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL)
_RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
_RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
_RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
_RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)
# Cached XSLT transform
_mml_xslt_transform = None
def __init__(self):
"""Initialize converter."""
@classmethod
def _get_mml_xslt_transform(cls):
    """Return the class-level XSLT transform, compiling it on first use.

    The compiled transform is stored on the class so later calls reuse it
    instead of re-parsing the ``MML_XSLT`` stylesheet.
    """
    cached = cls._mml_xslt_transform
    if cached is not None:
        return cached

    # Imported lazily so lxml is only needed once a transform is requested.
    from lxml import etree

    stylesheet = etree.fromstring(MML_XSLT.encode("utf-8"))
    cls._mml_xslt_transform = etree.XSLT(stylesheet)
    return cls._mml_xslt_transform
def _is_formula_only(self, text: str) -> bool:
"""Check if text contains only a LaTeX formula (no mixed content).
A text is considered formula-only if it matches one of these patterns:
- Display math: $$...$$ or \\[...\\]
- Inline math: $...$ or \\(...\\)
Args:
text: Input text to check.
Returns:
True if the text contains only a LaTeX formula, False otherwise.
"""
text = text.strip()
if not text:
return False
# Strict patterns: entire text must be a single formula with delimiters
# Using pre-compiled patterns with fullmatch semantics
if self._RE_DISPLAY_DOLLAR.fullmatch(text):
return True
if self._RE_DISPLAY_BRACKET.fullmatch(text):
return True
if self._RE_INLINE_DOLLAR.fullmatch(text):
return True
if self._RE_INLINE_PAREN.fullmatch(text):
return True
return False
def convert_to_formats(self, md_text: str) -> ConvertResult:
"""Convert markdown to LaTeX and MathML formats.
"""Convert markdown to LaTeX, MathML, and MML formats.
Only converts when input contains a pure LaTeX formula.
Mixed content (text + formula) returns empty strings for all fields.
Args:
md_text: Markdown text to convert.
Returns:
ConvertResult with latex and mathml fields.
ConvertResult with latex, mathml, and mml fields.
All fields are empty if input is not a pure formula.
Raises:
ValueError: If md_text is empty.
RuntimeError: If conversion fails.
RuntimeError: If conversion fails for a valid formula.
"""
if md_text == "":
return ConvertResult(latex="", mathml="")
# Empty input returns empty result
if not md_text or not md_text.strip():
return ConvertResult(latex="", mathml="", mml="")
# Check if input is formula-only
if not self._is_formula_only(md_text):
# Mixed content: cannot convert to formula formats
return ConvertResult(latex="", mathml="", mml="")
try:
# Convert to LaTeX
latex_output = pypandoc.convert_text(
md_text,
"latex",
format=self.INPUT_FORMAT,
).rstrip("\n")
# Extract the LaTeX formula content (remove delimiters)
latex_formula = self._extract_latex_formula(md_text)
# Convert to HTML with MathML
mathml_output = pypandoc.convert_text(
md_text,
"html",
format=self.INPUT_FORMAT,
extra_args=["--mathml"],
).rstrip("\n")
# Convert to MathML
mathml = self._latex_to_mathml(latex_formula)
return ConvertResult(latex=latex_output, mathml=mathml_output)
# Convert MathML to mml:math format (with namespace prefix)
mml = self._mathml_to_mml(mathml)
return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)
except Exception as e:
raise RuntimeError(f"Conversion failed: {e}") from e
def convert_to_omml(self, latex_formula: str) -> str:
    """Turn a bare LaTeX formula into OMML (Office Math Markup Language).

    Kept apart from the other format conversions because OMML generation
    goes through a temporary DOCX file and is comparatively slow. The
    formula is stripped and run through the OMML preprocessing step before
    the actual conversion.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters like $ or $$).

    Returns:
        OMML representation as XML string.

    Raises:
        ValueError: If latex_formula is empty or whitespace-only.
        RuntimeError: If conversion fails.
    """
    formula = latex_formula.strip() if latex_formula else ""
    if not formula:
        raise ValueError("LaTeX formula cannot be empty")

    return self._latex_to_omml(self._preprocess_formula_for_omml(formula))
def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
    """Run the formula through the OMML preparation pipeline.

    The steps are applied in this order:
    1. matrix environment conversion,
    2. array column specifier clean-up (spaces removed),
    3. brace spacing fixes,
    4. special environment conversion (cases, aligned).

    Args:
        latex_formula: Pure LaTeX formula.

    Returns:
        Preprocessed LaTeX formula.
    """
    pipeline = (
        self._convert_matrix_environments,
        self._fix_array_column_specifiers,
        self._fix_brace_spacing,
        self._convert_special_environments,
    )
    for step in pipeline:
        latex_formula = step(latex_formula)
    return latex_formula
def _extract_latex_formula(self, text: str) -> str:
"""Extract LaTeX formula from text by removing delimiters.
Args:
text: Text containing LaTeX formula with delimiters.
Returns:
Pure LaTeX formula without delimiters.
"""
text = text.strip()
# Remove display math delimiters: $$...$$ or \[...\]
if text.startswith("$$") and text.endswith("$$"):
return text[2:-2].strip()
if text.startswith("\\[") and text.endswith("\\]"):
return text[2:-2].strip()
# Remove inline math delimiters: $...$ or \(...\)
if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
return text[1:-1].strip()
if text.startswith("\\(") and text.endswith("\\)"):
return text[2:-2].strip()
# If no delimiters, return as-is
return text.strip()
@staticmethod
@lru_cache(maxsize=256)
def _latex_to_mathml_cached(latex_formula: str) -> str:
    """Convert a LaTeX formula to MathML, memoizing results per formula.

    The latex2mathml converter is tried first. If it raises, Pandoc is used
    as a fallback and the ``<math>`` element is pulled out of its HTML
    output (the whole output, minus trailing newlines, if none is found).

    Raises:
        RuntimeError: If both the primary conversion and the Pandoc fallback fail.
    """
    try:
        return latex_to_mathml(latex_formula)
    except Exception as primary_error:
        try:
            html = pypandoc.convert_text(
                f"${latex_formula}$",
                "html",
                format="markdown+tex_math_dollars",
                extra_args=["--mathml"],
            )
            found = Converter._RE_MATH_ELEMENT.search(html)
            return found.group(0) if found else html.rstrip("\n")
        except Exception as pandoc_error:
            # Report both failures; chain to the first one as the cause.
            raise RuntimeError(
                f"MathML conversion failed: {primary_error}. Pandoc fallback also failed: {pandoc_error}"
            ) from primary_error
def _latex_to_mathml(self, latex_formula: str) -> str:
    """Return standard MathML for a LaTeX formula.

    Thin instance-level entry point that delegates to the cached converter.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).

    Returns:
        Standard MathML representation.
    """
    mathml = self._latex_to_mathml_cached(latex_formula)
    return mathml
def _mathml_to_mml(self, mathml: str) -> str:
"""Convert standard MathML to mml:math format with namespace prefix.
Uses XSLT for efficient transformation. Transforms:
- <math ...> to <mml:math xmlns:mml="..." ...>
- All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>
Args:
mathml: Standard MathML string.
Returns:
MathML with mml: namespace prefix.
"""
if not mathml:
return ""
try:
from lxml import etree
# Parse MathML
root = etree.fromstring(mathml.encode("utf-8"))
# Apply XSLT transformation (cached)
transform = self._get_mml_xslt_transform()
result_tree = transform(root)
# Serialize to string
return str(result_tree)
except Exception:
# Fallback: simple string replacement (less robust but no lxml dependency)
result = mathml
# Add namespace to root math element
result = re.sub(
r"<math\b",
f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
result,
)
result = re.sub(r"</math>", "</mml:math>", result)
# Add mml: prefix to all other elements using a single regex
# Match opening tags
result = re.sub(
r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
r"maction|semantics|annotation|annotation-xml)\b",
r"<mml:\1",
result,
)
# Match closing tags
result = re.sub(
r"</(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
r"maction|semantics|annotation|annotation-xml)>",
r"</mml:\1>",
result,
)
return result
def _latex_to_omml(self, latex_formula: str) -> str:
    """Convert LaTeX formula to OMML (Office Math Markup Language).

    Writes the formula as display math into a Markdown file, has Pandoc
    convert it to DOCX, then reads ``word/document.xml`` from the DOCX
    archive and serializes every ``oMath`` element found in it.

    Both files live in a private temporary directory that is removed
    automatically, so no predictable sibling path is derived from the input
    file name and nothing is left behind on failure.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).

    Returns:
        OMML representation as XML string; multiple ``oMath`` elements are
        joined with newlines. Empty string if the document has none.

    Raises:
        RuntimeError: If any step of the conversion fails.
    """
    import zipfile

    try:
        from lxml import etree

        with tempfile.TemporaryDirectory() as work_dir:
            md_path = os.path.join(work_dir, "formula.md")
            docx_path = os.path.join(work_dir, "formula.docx")

            # Pandoc reads its input as UTF-8, so do not rely on the
            # platform's default text encoding here.
            with open(md_path, "w", encoding="utf-8") as md_file:
                md_file.write(f"$${latex_formula}$$\n")

            pypandoc.convert_file(
                md_path,
                "docx",
                format=self.INPUT_FORMAT,
                outputfile=docx_path,
            )

            # A DOCX file is a ZIP archive; only the main document part is needed.
            with zipfile.ZipFile(docx_path, "r") as docx_zip:
                document_xml = docx_zip.read("word/document.xml")

        root = etree.fromstring(document_xml)
        omml_parts = [
            etree.tostring(math, encoding="unicode")
            for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath")
        ]
        return "\n".join(omml_parts)

    except Exception as e:
        raise RuntimeError(f"OMML conversion failed: {e}") from e
def preprocess_for_export(self, md_text: str) -> str:
"""Preprocess markdown text for export to docx/pdf.
Handles LaTeX formula formatting, matrix environments, and
other transformations needed for proper Word/PDF rendering.
Uses pre-compiled regex patterns for better performance.
Args:
md_text: Raw markdown text.
@@ -88,46 +471,39 @@ class Converter:
Preprocessed markdown text.
"""
# Replace \[1mm] => \vspace{1mm}
md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)
md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)
# Add blank lines around \[...\] block formulas
md_text = re.sub(
r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
r"\1\n\n\\[\3\\]\n\n\4",
md_text,
flags=re.DOTALL,
)
md_text = re.sub(
r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
r"\n\\[\2\\]\n",
md_text,
flags=re.MULTILINE | re.DOTALL,
)
md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text)
md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text)
# Remove arithmatex span wrappers
cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)
cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text)
# Convert inline formulas: \( \) => $ $
cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$")
# Convert block formulas: \[ \] => $$ $$
cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$")
# Remove spaces between $ and formula content
# Use negative lookahead/lookbehind to avoid matching $$ block formulas
cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)
cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md)
# Convert matrix environments for better Word rendering
cleaned_md = self._convert_matrix_environments(cleaned_md)
# Fix array environment column specifiers (remove spaces)
cleaned_md = self._fix_array_column_specifiers(cleaned_md)
# Fix brace spacing for equation systems
cleaned_md = self._fix_brace_spacing(cleaned_md)
# Convert cases and aligned environments
cleaned_md = self._convert_special_environments(cleaned_md)
# Handle LaTeX \tag{} commands for equation numbering
cleaned_md = self._convert_tag_commands(cleaned_md)
return cleaned_md
def _convert_matrix_environments(self, md_text: str) -> str:
@@ -136,42 +512,41 @@ class Converter:
This fixes the vertical line height issues in Word.
"""
# vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
md_text = re.sub(
r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
md_text = self._RE_VMATRIX.sub(
r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
md_text,
flags=re.DOTALL,
)
# Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
md_text = re.sub(
r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
md_text = self._RE_VMATRIX_DOUBLE.sub(
r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
md_text,
flags=re.DOTALL,
)
return md_text
def _fix_array_column_specifiers(self, md_text: str) -> str:
"""Fix array environment column specifiers by removing spaces.
Pandoc's OMML converter doesn't accept spaces between column alignment
specifiers in array environments. This converts patterns like
{c c c c} to {cccc}.
"""
def remove_spaces_in_specifier(match: re.Match) -> str:
"""Remove spaces from column specifier."""
specifier = match.group(1)
return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"
return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)
def _fix_brace_spacing(self, md_text: str) -> str:
"""Fix spacing issues with braces in equation systems.
Removes whitespace and adds negative space for proper alignment in Word/OMML.
"""
# Fix \left\{ spacing
md_text = re.sub(
r"\\left\\\{\s+",
r"\\left\\{\\!",
md_text,
)
# Fix \right\} spacing
md_text = re.sub(
r"\s+\\right\\\}",
r"\\!\\right\\}",
md_text,
)
md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
return md_text
def _convert_special_environments(self, md_text: str) -> str:
@@ -179,45 +554,45 @@ class Converter:
These environments have better rendering support in Word/OMML.
"""
# Pre-compiled pattern for alignment marker removal
_re_align_marker = re.compile(r"(^|\\\\)\s*&")
def convert_cases(match: re.Match) -> str:
content = match.group(1)
return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."
md_text = re.sub(
r"\\begin\{cases\}(.*?)\\end\{cases\}",
convert_cases,
md_text,
flags=re.DOTALL,
)
md_text = self._RE_CASES.sub(convert_cases, md_text)
def convert_aligned_to_array(match: re.Match) -> str:
content = match.group(1)
# Remove leading & alignment markers (not needed in array{l})
content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
content = _re_align_marker.sub(r"\1", content)
return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."
md_text = re.sub(
r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
convert_aligned_to_array,
md_text,
flags=re.DOTALL,
)
md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)
def convert_standalone_aligned(match: re.Match) -> str:
content = match.group(1)
content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
content = _re_align_marker.sub(r"\1", content)
return r"\begin{array}{l}" + content + r"\end{array}"
md_text = re.sub(
r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
convert_standalone_aligned,
md_text,
flags=re.DOTALL,
)
md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)
return md_text
def _convert_tag_commands(self, md_text: str) -> str:
"""Convert LaTeX \\tag{} commands to Word-compatible format.
The \\tag{} command is not supported in Word OMML format, so we convert it to
use simple spacing (\\quad) to push the equation number to the right side.
"""
def convert_tag(match: re.Match) -> str:
formula_content = match.group(1)
tag_content = match.group(2)
return f"$${formula_content} \\quad ({tag_content})$$"
return self._RE_TAG.sub(convert_tag, md_text)
def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
"""Export markdown to docx or pdf file.
@@ -309,4 +684,3 @@ class Converter:
"""
if os.path.exists(file_path):
os.remove(file_path)

View File

@@ -25,6 +25,38 @@ class ImageProcessor:
"""
self.padding_ratio = padding_ratio or settings.image_padding_ratio
def _convert_to_bgr(self, pil_image: Image.Image) -> np.ndarray:
    """Flatten a PIL image onto white where needed and return it as BGR.

    Args:
        pil_image: PIL Image object.

    Returns:
        Image as numpy array in BGR format.
    """
    mode = pil_image.mode

    # Palette images may carry transparency: expand them to RGBA first so
    # they go through the same compositing step as native RGBA images.
    if mode == "P":
        pil_image = pil_image.convert("RGBA")
        mode = "RGBA"

    if mode == "RGBA":
        # Composite over a white canvas, using the alpha band as the mask.
        canvas = Image.new("RGB", pil_image.size, (255, 255, 255))
        canvas.paste(pil_image, mask=pil_image.split()[3])
        pil_image = canvas
    elif mode == "LA":
        # Grayscale + alpha: composite over white in "L", then widen to RGB.
        canvas = Image.new("L", pil_image.size, 255)
        canvas.paste(pil_image, mask=pil_image.split()[1])
        pil_image = canvas.convert("RGB")
    elif mode != "RGB":
        # Every other mode is converted straight to RGB.
        pil_image = pil_image.convert("RGB")

    rgb_pixels = np.array(pil_image)
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
def load_image_from_url(self, url: str) -> np.ndarray:
"""Load image from URL.
@@ -40,8 +72,8 @@ class ImageProcessor:
try:
with urlopen(url, timeout=30) as response:
image_data = response.read()
image = Image.open(io.BytesIO(image_data))
return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
pil_image = Image.open(io.BytesIO(image_data))
return self._convert_to_bgr(pil_image)
except Exception as e:
raise ValueError(f"Failed to load image from URL: {e}") from e
@@ -63,8 +95,8 @@ class ImageProcessor:
base64_str = base64_str.split(",", 1)[1]
image_data = base64.b64decode(base64_str)
image = Image.open(io.BytesIO(image_data))
return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
pil_image = Image.open(io.BytesIO(image_data))
return self._convert_to_bgr(pil_image)
except Exception as e:
raise ValueError(f"Failed to decode base64 image: {e}") from e

View File

@@ -140,18 +140,39 @@ class LayoutDetector:
if __name__ == "__main__":
import cv2
from app.core.config import get_settings
from app.services.image_processor import ImageProcessor
from app.services.converter import Converter
from app.services.ocr_service import OCRService
settings = get_settings()
# Initialize dependencies
layout_detector = LayoutDetector()
image_path = "test/timeout.png"
image_processor = ImageProcessor(padding_ratio=settings.image_padding_ratio)
converter = Converter()
# Initialize OCR service
ocr_service = OCRService(
vl_server_url=settings.paddleocr_vl_url,
layout_detector=layout_detector,
image_processor=image_processor,
converter=converter,
)
# Load test image
image_path = "test/complex_formula.png"
image = cv2.imread(image_path)
image_processor = ImageProcessor(padding_ratio=0.15)
image = image_processor.add_padding(image)
# Save the padded image for debugging
cv2.imwrite("debug_padded_image.png", image)
layout_info = layout_detector.detect(image)
print(layout_info)
if image is None:
print(f"Failed to load image: {image_path}")
else:
print(f"Image loaded: {image.shape}")
# Run OCR recognition
result = ocr_service.recognize(image)
print("\n=== OCR Result ===")
print(f"Markdown:\n{result['markdown']}")
print(f"\nLaTeX:\n{result['latex']}")
print(f"\nMathML:\n{result['mathml']}")

View File

@@ -1,17 +1,121 @@
"""PaddleOCR-VL client service for text and formula recognition."""
import re
import numpy as np
import cv2
import requests
from io import BytesIO
from app.core.config import get_settings
from paddleocr import PaddleOCRVL
from typing import Optional
from app.services.layout_detector import LayoutDetector
from app.services.image_processor import ImageProcessor
from app.services.converter import Converter
from abc import ABC, abstractmethod
settings = get_settings()
_COMMANDS_NEED_SPACE = {
# operators / calculus
"cdot",
"times",
"div",
"pm",
"mp",
"int",
"iint",
"iiint",
"oint",
"sum",
"prod",
"lim",
# common functions
"sin",
"cos",
"tan",
"cot",
"sec",
"csc",
"log",
"ln",
"exp",
# misc
"partial",
"nabla",
}
class OCRService:
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")
# stage2: differentials inside math segments
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")
def _split_glued_command_token(token: str) -> str:
"""Split OCR-glued LaTeX command token by whitelist longest-prefix.
Examples:
- \\cdotdS -> \\cdot dS
- \\intdx -> \\int dx
"""
if not token.startswith("\\"):
return token
body = token[1:]
if len(body) < 2:
return token
best = None
# longest prefix that is in whitelist
for i in range(1, len(body)):
prefix = body[:i]
if prefix in _COMMANDS_NEED_SPACE:
best = prefix
if not best:
return token
suffix = body[len(best) :]
if not suffix:
return token
return f"\\{best} {suffix}"
def _postprocess_math(expr: str) -> str:
"""Postprocess a *math* expression (already inside $...$ or $$...$$)."""
# stage1: split glued command tokens (e.g. \cdotdS)
expr = _COMMAND_TOKEN_PATTERN.sub(lambda m: _split_glued_command_token(m.group(0)), expr)
# stage2: normalize differentials (keep conservative)
expr = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", expr)
expr = _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", expr)
return expr
def _postprocess_markdown(markdown_content: str) -> str:
"""Apply LaTeX postprocessing only within $...$ / $$...$$ segments."""
if not markdown_content:
return markdown_content
def _fix_segment(m: re.Match) -> str:
seg = m.group(0)
if seg.startswith("$$") and seg.endswith("$$"):
return f"$${_postprocess_math(seg[2:-2])}$$"
if seg.startswith("$") and seg.endswith("$"):
return f"${_postprocess_math(seg[1:-1])}$"
return seg
return _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
class OCRServiceBase(ABC):
    """Abstract interface implemented by the OCR backends in this module."""

    @abstractmethod
    def recognize(self, image: np.ndarray) -> dict:
        """Recognize the content of *image* and return the result as a dict."""
class OCRService(OCRServiceBase):
"""Service for OCR using PaddleOCR-VL."""
_pipeline: Optional[PaddleOCRVL] = None
@@ -32,10 +136,11 @@ class OCRService:
image_processor: Image processor instance.
"""
self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
self.layout_detector = layout_detector
self.layout_detector = layout_detector
self.image_processor = image_processor
self.converter = converter
def _get_pipeline(self):
def _get_pipeline(self):
"""Get or create PaddleOCR-VL pipeline.
Returns:
@@ -49,7 +154,7 @@ class OCRService:
)
return OCRService._pipeline
def recognize_mixed(self, image: np.ndarray) -> dict:
def _recognize_mixed(self, image: np.ndarray) -> dict:
"""Recognize mixed content (text + formulas) using PP-DocLayoutV2.
This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
@@ -71,17 +176,19 @@ class OCRService:
for res in output:
markdown_content += res.markdown.get("markdown_texts", "")
convert_result = self.converter.convert_to_formats(markdown_content)
markdown_content = _postprocess_markdown(markdown_content)
convert_result = self.converter.convert_to_formats(markdown_content)
return {
"markdown": markdown_content,
"latex": convert_result.latex,
"mathml": convert_result.mathml,
"mml": convert_result.mml,
}
except Exception as e:
raise RuntimeError(f"Mixed recognition failed: {e}") from e
def recognize_formula(self, image: np.ndarray) -> dict:
def _recognize_formula(self, image: np.ndarray) -> dict:
"""Recognize formula/math content using PaddleOCR-VL with prompt.
This mode uses PaddleOCR-VL directly with a formula recognition prompt.
@@ -102,11 +209,13 @@ class OCRService:
for res in output:
markdown_content += res.markdown.get("markdown_texts", "")
markdown_content = _postprocess_markdown(markdown_content)
convert_result = self.converter.convert_to_formats(markdown_content)
return {
"latex": convert_result.latex,
"mathml": convert_result.mathml,
"mml": convert_result.mml,
"markdown": markdown_content,
}
except Exception as e:
@@ -124,18 +233,109 @@ class OCRService:
padded_image = self.image_processor.add_padding(image)
layout_info = self.layout_detector.detect(padded_image)
if layout_info.MixedRecognition:
return self.recognize_mixed(image)
return self._recognize_mixed(image)
else:
return self.recognize_formula(image)
return self._recognize_formula(image)
class MineruOCRService(OCRServiceBase):
    """OCR backend that sends the image to a local ``file_parse`` HTTP API."""

    def __init__(
        self,
        api_url: str = "http://127.0.0.1:8000/file_parse",
        image_processor: Optional[ImageProcessor] = None,
        converter: Optional[Converter] = None,
    ):
        """Store the endpoint and the optional collaborators.

        Args:
            api_url: URL of the local file_parse API endpoint.
            image_processor: Optional processor; when set, its ``add_padding``
                is applied to the image before upload.
            converter: Optional converter; when set, the returned markdown is
                also converted to the other output formats.
        """
        self.api_url = api_url
        self.image_processor = image_processor
        self.converter = converter

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content by posting the image to the file_parse API.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml' and 'mml' keys. The last
            three are empty strings when no converter is configured or the
            API returned no markdown.

        Raises:
            RuntimeError: If the HTTP request fails or any other step raises.
        """
        try:
            source = image
            if self.image_processor:
                source = self.image_processor.add_padding(source)

            # Serialize the array as PNG so it can be sent as a file upload.
            ok, png = cv2.imencode(".png", source)
            if not ok:
                raise RuntimeError("Failed to encode image")

            # The upload is named "image.png"; the response is looked up
            # under the "image" key below.
            upload = {"files": ("image.png", BytesIO(png.tobytes()), "image/png")}
            form_fields = dict(
                return_middle_json="false",
                return_model_output="false",
                return_md="true",
                return_images="false",
                end_page_id="99999",
                start_page_id="0",
                lang_list="en",
                server_url="string",
                return_content_list="false",
                backend="hybrid-auto-engine",
                table_enable="true",
                response_format_zip="false",
                formula_enable="true",
                parse_method="ocr",
            )

            reply = requests.post(
                self.api_url,
                files=upload,
                data=form_fields,
                headers={"accept": "application/json"},
                timeout=30,
            )
            reply.raise_for_status()
            payload = reply.json()

            markdown_content = ""
            if "results" in payload and "image" in payload["results"]:
                markdown_content = payload["results"]["image"].get("md_content", "")
            # NOTE: the LaTeX post-processing step (_postprocess_markdown) is
            # currently disabled for this backend.

            formats = {"latex": "", "mathml": "", "mml": ""}
            if self.converter and markdown_content:
                converted = self.converter.convert_to_formats(markdown_content)
                formats = {
                    "latex": converted.latex,
                    "mathml": converted.mathml,
                    "mml": converted.mml,
                }

            return {"markdown": markdown_content, **formats}
        except requests.RequestException as e:
            raise RuntimeError(f"Local API request failed: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Recognition failed: {e}") from e
if __name__ == "__main__":
import cv2
from app.services.image_processor import ImageProcessor
from app.services.layout_detector import LayoutDetector
image_processor = ImageProcessor(padding_ratio=0.15)
layout_detector = LayoutDetector()
ocr_service = OCRService(image_processor=image_processor, layout_detector=layout_detector)
image = cv2.imread("test/image.png")
ocr_result = ocr_service.recognize(image)
print(ocr_result)
mineru_service = MineruOCRService()
image = cv2.imread("test/complex_formula.png")
image_numpy = np.array(image)
ocr_result = mineru_service.recognize(image_numpy)
print(ocr_result)

View File

@@ -26,7 +26,8 @@ dependencies = [
"pypandoc==1.16.2",
"paddlepaddle",
"paddleocr[doc-parser]",
"safetensors"
"safetensors",
"lxml>=5.0.0"
]
[tool.uv.sources]

102
test_array_fix.py Normal file
View File

@@ -0,0 +1,102 @@
"""Test script for array column specifier fix."""
from app.services.converter import Converter
def test_array_specifier_fix():
    """Check that array column specifiers with spaces are fixed.

    Runs the converter's OMML preprocessing and the full OMML conversion on a
    nested-determinant formula and prints a report.

    Returns:
        True when the OMML conversion succeeds and the preprocessing check did
        not fail; False otherwise.
    """
    converter = Converter()
    # The problematic LaTeX from the error
    latex_formula = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""
    print("Testing array column specifier fix")
    print("=" * 80)
    print(f"\nOriginal LaTeX (first 200 chars):\n{latex_formula[:200]}...")

    # Test preprocessing
    print("\n" + "-" * 80)
    print("Step 1: Preprocessing")
    preprocessed = converter._preprocess_formula_for_omml(latex_formula)

    # Check if spaces were removed from array specifiers
    preprocessing_ok = True
    if "{c c c c}" in preprocessed:
        preprocessing_ok = False
        start = preprocessed.find("{c c c c}")
        print("✗ FAILED: Spaces not removed from array specifiers")
        print(f"Found: {preprocessed[start:start + 10]}")
    elif "{cccc}" in preprocessed:
        print("✓ SUCCESS: Spaces removed from array specifiers")
        # Plain literal: the original f-string put a backslash inside a
        # replacement field, which is a SyntaxError before Python 3.12.
        print("Changed '{c c c c}''{cccc}'")
    else:
        print("? Could not find array specifier in preprocessed output")

    # Test OMML conversion
    print("\n" + "-" * 80)
    print("Step 2: OMML Conversion")
    try:
        omml = converter.convert_to_omml(latex_formula)
        print("✓ SUCCESS: OMML conversion completed")
        print(f"OMML length: {len(omml)} characters")
        print(f"OMML preview (first 300 chars):\n{omml[:300]}...")
        # Check if it contains oMath element
        if "oMath" in omml:
            print("\n✓ Valid OMML: Contains oMath element")
        else:
            print("\n✗ WARNING: OMML might be incomplete (no oMath element found)")
    except Exception as e:
        print("✗ FAILED: OMML conversion error")
        print(f"Error: {e}")
        return False

    print("\n" + "=" * 80)
    if not preprocessing_ok:
        # Do not report overall success when step 1 reported a failure.
        print("✗ FAILED: preprocessing check did not pass")
        return False
    print("✓ All tests passed!")
    return True
def test_simple_array():
    """Convert a small three-column array to OMML and report the outcome.

    Returns:
        True if the conversion completes, False if it raises.
    """
    service = Converter()
    print("\nTesting simple array")
    print("=" * 80)

    # Column specifier deliberately contains spaces.
    formula = r"\begin{array}{c c c} a & b & c \\ d & e & f \end{array}"
    print(f"LaTeX: {formula}")

    try:
        converted = service.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(converted)} chars)")
        print(f"Preview: {converted[:200]}...")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
if __name__ == "__main__":
    import traceback

    print("Array Column Specifier Fix Test Suite\n")
    try:
        # Both tests always run; the simple one goes first.
        outcomes = (test_simple_array(), test_array_specifier_fix())
        if all(outcomes):
            verdict = "✓✓✓ ALL TESTS PASSED ✓✓✓"
        else:
            verdict = "✗✗✗ SOME TESTS FAILED ✗✗✗"
        print("\n" + "=" * 80)
        print(verdict)
        print("=" * 80)
    except KeyboardInterrupt:
        print("\n\nTests interrupted by user")
    except Exception as exc:
        print(f"\n\nTest suite error: {exc}")
        traceback.print_exc()

57
test_converter.py Normal file
View File

@@ -0,0 +1,57 @@
"""Test script for converter functionality."""
from app.services.converter import Converter
def test_latex_only_conversion():
    """Print the converter output for four representative inputs.

    This is a manual smoke test: it prints the fields of each conversion
    result and asserts nothing.
    """
    converter = Converter()

    # Test case 1: Display math with $$...$$
    latex_input = "$$E = mc^2$$"
    result = converter.convert_to_formats(latex_input)
    print("Test 1: Display math ($$...$$)")
    print(f"Input: {latex_input}")
    print(f"LaTeX: {result.latex}")
    print(f"MathML: {result.mathml[:100]}...")
    print(f"MML: {result.mml[:100]}...")
    print(f"OMML: {result.omml[:100] if result.omml else 'Empty'}...")
    print()

    # Test case 2: Inline math with $...$
    latex_input2 = "$\\frac{a}{b}$"
    result2 = converter.convert_to_formats(latex_input2)
    print("Test 2: Inline math ($...$)")
    print(f"Input: {latex_input2}")
    print(f"LaTeX: {result2.latex}")
    print(f"MathML: {result2.mathml[:100]}...")
    print()

    # Test case 3: Complex formula
    latex_input3 = "$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$"
    result3 = converter.convert_to_formats(latex_input3)
    print("Test 3: Complex formula")
    print(f"Input: {latex_input3}")
    print(f"LaTeX: {result3.latex}")
    print(f"MathML: {result3.mathml[:150]}...")
    # Test 1 already treats omml as possibly empty/None; guard here as well so
    # a missing OMML value is reported as length 0 instead of raising.
    print(f"OMML length: {len(result3.omml or '')}")
    print()

    # Test case 4: Regular markdown (not LaTeX-only)
    markdown_input = "# Hello\n\nThis is a test with math: $x = 2$"
    result4 = converter.convert_to_formats(markdown_input)
    print("Test 4: Regular markdown")
    print(f"Input: {markdown_input}")
    print(f"LaTeX: {result4.latex[:100]}...")
    print(f"MathML: {result4.mathml[:100]}...")
    print(f"MML: {result4.mml}")
    print(f"OMML: {result4.omml}")
    print()


if __name__ == "__main__":
    test_latex_only_conversion()

112
test_omml_api.py Normal file
View File

@@ -0,0 +1,112 @@
"""Test script for OMML conversion API endpoint."""
import requests
import json
def test_latex_to_omml():
    """Post sample formulas to /convert/latex-to-omml and print each outcome.

    Request errors are caught and reported per case, so one failing case does
    not stop the remaining ones.
    """
    # (display name, LaTeX source)
    samples = [
        ("Simple fraction", "\\frac{a}{b}"),
        ("Quadratic formula", "x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}"),
        ("Integral", "\\int_0^\\infty e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}"),
        ("Matrix", "\\begin{matrix} a & b \\\\ c & d \\end{matrix}"),
    ]
    endpoint = "http://localhost:8000/api/v1/convert/latex-to-omml"

    print("Testing OMML Conversion API")
    print("=" * 80)

    for number, (title, formula) in enumerate(samples, 1):
        print(f"\nTest {number}: {title}")
        print("-" * 80)
        print(f"LaTeX: {formula}")
        try:
            reply = requests.post(
                endpoint,
                json={"latex": formula},
                headers={"Content-Type": "application/json"},
                timeout=10,
            )
            if reply.status_code != 200:
                print(f"✗ Status: {reply.status_code}")
                print(f"Error: {reply.text}")
                continue

            omml = reply.json().get("omml", "")
            print(f"✓ Status: {reply.status_code}")
            print(f"OMML length: {len(omml)} characters")
            print(f"OMML preview: {omml[:150]}...")
        except requests.exceptions.RequestException as exc:
            print(f"✗ Request failed: {exc}")
        except Exception as exc:
            print(f"✗ Error: {exc}")

    print("\n" + "=" * 80)
def test_invalid_input():
    """Post invalid payloads to the endpoint and print status and response.

    Covers an empty ``latex`` string and a payload without the ``latex``
    field. Exceptions (connection errors, timeouts, non-JSON bodies) are not
    caught here and propagate to the caller.
    """
    print("\nTesting Error Handling")
    print("=" * 80)
    base_url = "http://localhost:8000/api/v1/convert/latex-to-omml"

    # (label, JSON payload)
    scenarios = [
        ("Empty LaTeX", {"latex": ""}),
        ("Missing LaTeX field", {}),
    ]
    for label, payload in scenarios:
        print(f"\nTest: {label}")
        response = requests.post(
            base_url,
            json=payload,
            headers={"Content-Type": "application/json"},
            # Same limit as test_latex_to_omml; without a timeout a stalled
            # server would block this script indefinitely.
            timeout=10,
        )
        print(f"Status: {response.status_code}")
        print(f"Response: {response.json()}")

    print("\n" + "=" * 80)
if __name__ == "__main__":
    for banner_line in (
        "OMML API Test Suite",
        "Make sure the API server is running on http://localhost:8000",
        "",
    ):
        print(banner_line)

    try:
        # Run the suites in order; the first uncaught error aborts the run.
        for suite in (test_latex_to_omml, test_invalid_input):
            suite()
        print("\n✓ All tests completed!")
    except KeyboardInterrupt:
        print("\n\n✗ Tests interrupted by user")
    except Exception as exc:
        print(f"\n✗ Test suite failed: {exc}")

218
test_omml_preprocessing.py Normal file
View File

@@ -0,0 +1,218 @@
"""Comprehensive test for OMML conversion with preprocessing."""
from app.services.converter import Converter
def test_case_1_array_with_spaces():
    """Convert the nested-determinant formula whose array specs contain spaces.

    Returns:
        True if conversion and the follow-up checks run without raising,
        False otherwise. The checks only print; they do not affect the result.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Test 1: Array with spaces in column specifier")
    print(banner)

    service = Converter()
    # The problematic LaTeX from the error
    formula = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""
    print(f"LaTeX length: {len(formula)} chars")
    print(f"Preview: {formula[:100]}...")

    try:
        converted = service.convert_to_omml(formula)
        print("\n✓ SUCCESS: Converted to OMML")
        print(f"OMML length: {len(converted)} chars")
        if "oMath" in converted:
            print("✓ Valid OMML structure detected")

        # Check preprocessing worked
        rewritten = service._preprocess_formula_for_omml(formula)
        spaced_spec_gone = "{c c c c}" not in rewritten
        if spaced_spec_gone and "{cccc}" in rewritten:
            print("✓ Array column specifiers fixed: '{c c c c}''{cccc}'")
    except Exception as exc:
        print(f"\n✗ FAILED: {exc}")
        return False
    return True
def test_case_2_vmatrix():
    """Convert a ``vmatrix`` determinant to OMML and report the outcome.

    Returns:
        True if conversion and preprocessing run without raising, else False.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Test 2: vmatrix environment")
    print(banner)

    service = Converter()
    formula = r"\begin{vmatrix} a & b \\ c & d \end{vmatrix}"
    print(f"LaTeX: {formula}")

    try:
        converted = service.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(converted)} chars)")

        # Informational only: report whether vmatrix was rewritten.
        rewritten = service._preprocess_formula_for_omml(formula)
        vmatrix_gone = "vmatrix" not in rewritten
        if vmatrix_gone and r"\left|" in rewritten:
            print("✓ vmatrix converted to \\left| ... \\right|")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
def test_case_3_cases_environment():
    """Convert a piecewise ``cases`` definition to OMML and report the outcome.

    Returns:
        True if conversion and preprocessing run without raising, else False.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Test 3: cases environment")
    print(banner)

    service = Converter()
    formula = r"f(x) = \begin{cases} x^2 & x \geq 0 \\ -x & x < 0 \end{cases}"
    print(f"LaTeX: {formula}")

    try:
        converted = service.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(converted)} chars)")

        # Informational only: report whether cases became an array.
        rewritten = service._preprocess_formula_for_omml(formula)
        cases_gone = "cases" not in rewritten
        if cases_gone and "array" in rewritten:
            print("✓ cases converted to array environment")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
def test_case_4_aligned_environment():
    """Convert an ``aligned`` equation system to OMML and report the outcome.

    Returns:
        True if conversion and preprocessing run without raising, else False.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Test 4: aligned environment")
    print(banner)

    service = Converter()
    formula = r"\begin{aligned} x + y &= 5 \\ 2x - y &= 1 \end{aligned}"
    print(f"LaTeX: {formula}")

    try:
        converted = service.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(converted)} chars)")

        # Informational only: report what preprocessing changed.
        rewritten = service._preprocess_formula_for_omml(formula)
        aligned_gone = "aligned" not in rewritten
        if aligned_gone and "array" in rewritten:
            print("✓ aligned converted to array environment")

        fewer_markers = rewritten.count("&") < formula.count("&")
        if "&" not in rewritten or fewer_markers:
            print("✓ Alignment markers removed")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
def test_case_5_simple_formula():
    """Convert the quadratic formula to OMML and report the outcome.

    Returns:
        True if the conversion completes, False if it raises.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Test 5: Simple formula")
    print(banner)

    service = Converter()
    formula = r"x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}"
    print(f"LaTeX: {formula}")

    try:
        converted = service.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(converted)} chars)")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
def test_case_6_nested_structures():
    """Convert a formula nesting vmatrix and cases inside an array.

    Returns:
        True if conversion and preprocessing run without raising, else False.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("Test 6: Nested structures")
    print(banner)

    service = Converter()
    formula = r"\left\{ \begin{array}{l c} \begin{vmatrix} a & b \\ c & d \end{vmatrix} & = ad - bc \\ f(x) = \begin{cases} 1 & x > 0 \\ 0 & x \leq 0 \end{cases} & \text{step function} \end{array} \right."
    print(f"LaTeX: {formula}")

    try:
        converted = service.convert_to_omml(formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(converted)} chars)")

        rewritten = service._preprocess_formula_for_omml(formula)
        print("\nPreprocessing applied:")
        # Each line is informational; none of them affects the return value.
        if "vmatrix" not in rewritten:
            print(" ✓ vmatrix converted")
        if "cases" not in rewritten:
            print(" ✓ cases converted")
        spaced_spec_gone = "{l c}" not in rewritten
        if spaced_spec_gone and "{lc}" in rewritten:
            print(" ✓ Array specifiers fixed")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
if __name__ == "__main__":
    import traceback

    print("=" * 80)
    print("OMML CONVERSION TEST SUITE")
    print("Testing preprocessing and conversion")
    print("=" * 80)

    # (summary label, test function), in execution order.
    suite = (
        ("Simple formula", test_case_5_simple_formula),
        ("Array with spaces", test_case_1_array_with_spaces),
        ("vmatrix", test_case_2_vmatrix),
        ("cases", test_case_3_cases_environment),
        ("aligned", test_case_4_aligned_environment),
        ("Nested structures", test_case_6_nested_structures),
    )
    results = []
    try:
        for label, run in suite:
            results.append((label, run()))

        # Summary
        print("\n" + "=" * 80)
        print("TEST SUMMARY")
        print("=" * 80)
        total = len(results)
        passed = len([label for label, ok in results if ok])
        for label, ok in results:
            mark = "✓ PASS" if ok else "✗ FAIL"
            print(f"{mark}: {label}")
        print("\n" + "-" * 80)
        print(f"Total: {passed}/{total} tests passed")
        if passed != total:
            print(f"\n✗✗✗ {total - passed} TESTS FAILED ✗✗✗")
        else:
            print("\n✓✓✓ ALL TESTS PASSED ✓✓✓")
        print("=" * 80)
    except KeyboardInterrupt:
        print("\n\nTests interrupted by user")
    except Exception as exc:
        print(f"\n\nTest suite error: {exc}")
        traceback.print_exc()