Compare commits
4 Commits
main
...
feature/co
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e31017cfe7 | ||
|
|
69f9a70ae5 | ||
|
|
27f25d9f4d | ||
|
|
526c1f3a0d |
@@ -1,10 +1,10 @@
|
||||
"""Markdown to DOCX conversion endpoint."""
|
||||
"""Format conversion endpoints."""
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi.responses import Response
|
||||
|
||||
from app.core.dependencies import get_converter
|
||||
from app.schemas.convert import MarkdownToDocxRequest
|
||||
from app.schemas.convert import MarkdownToDocxRequest, LatexToOmmlRequest, LatexToOmmlResponse
|
||||
from app.services.converter import Converter
|
||||
|
||||
router = APIRouter()
|
||||
@@ -28,3 +28,39 @@ async def convert_markdown_to_docx(
|
||||
)
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
|
||||
|
||||
|
||||
@router.post("/latex-to-omml", response_model=LatexToOmmlResponse)
async def convert_latex_to_omml(
    request: LatexToOmmlRequest,
    converter: Converter = Depends(get_converter),
) -> LatexToOmmlResponse:
    """Convert LaTeX formula to OMML (Office Math Markup Language).

    OMML is the math format used by Microsoft Word and other Office applications.
    This endpoint is separate from the main OCR endpoint due to the performance
    overhead of OMML conversion (requires creating a temporary DOCX file).

    The conversion itself is synchronous, so it is executed in the worker
    thread pool instead of on the event loop.

    Args:
        request: Contains the LaTeX formula to convert (without $ or $$ delimiters).
        converter: Converter service injected through ``get_converter``.

    Returns:
        OMML representation of the formula.

    Raises:
        HTTPException: 400 when the formula is empty or the converter raises
            ``ValueError``; 503 when the converter raises ``RuntimeError``.

    Example:
        ```bash
        curl -X POST "http://localhost:8000/api/v1/convert/latex-to-omml" \\
            -H "Content-Type: application/json" \\
            -d '{"latex": "\\\\frac{a}{b} + \\\\sqrt{c}"}'
        ```
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    if not request.latex or not request.latex.strip():
        raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")

    try:
        # convert_to_omml blocks on Pandoc and file I/O; running it inline in
        # an ``async def`` handler would stall every other request.
        omml = await run_in_threadpool(converter.convert_to_omml, request.latex)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except RuntimeError as e:
        raise HTTPException(status_code=503, detail=str(e)) from e

    return LatexToOmmlResponse(omml=omml)
|
||||
|
||||
@@ -28,6 +28,9 @@ async def process_image_ocr(
|
||||
- If plain text exists: use PP-DocLayoutV2 for mixed recognition
|
||||
- Otherwise: use PaddleOCR-VL with formula prompt
|
||||
4. Convert output to LaTeX, Markdown, and MathML formats
|
||||
|
||||
Note: OMML conversion is not included due to performance overhead.
|
||||
Use the /convert/latex-to-omml endpoint to convert LaTeX to OMML separately.
|
||||
"""
|
||||
|
||||
image = image_processor.preprocess(
|
||||
@@ -49,4 +52,5 @@ async def process_image_ocr(
|
||||
latex=ocr_result.get("latex", ""),
|
||||
markdown=ocr_result.get("markdown", ""),
|
||||
mathml=ocr_result.get("mathml", ""),
|
||||
mml=ocr_result.get("mml", ""),
|
||||
)
|
||||
|
||||
@@ -33,14 +33,13 @@ app = FastAPI(
|
||||
app.include_router(api_router, prefix=settings.api_prefix)
|
||||
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check():
|
||||
"""Health check endpoint."""
|
||||
return {"status": "healthy"}
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8053)
|
||||
|
||||
uvicorn.run(app, host="0.0.0.0", port=settings.port)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Request and response schemas for markdown to DOCX conversion endpoint."""
|
||||
"""Request and response schemas for format conversion endpoints."""
|
||||
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
@@ -17,3 +17,23 @@ class MarkdownToDocxRequest(BaseModel):
|
||||
raise ValueError("Markdown content cannot be empty")
|
||||
return v
|
||||
|
||||
|
||||
class LatexToOmmlRequest(BaseModel):
    """Payload accepted by the LaTeX to OMML conversion endpoint."""

    latex: str = Field(
        ...,
        description="Pure LaTeX formula (without $ or $$ delimiters)",
    )

    @field_validator("latex")
    @classmethod
    def validate_latex_not_empty(cls, v: str) -> str:
        """Reject formulas that are empty or consist only of whitespace.

        The value is returned unchanged (not stripped) when it is accepted.
        """
        if v and v.strip():
            return v
        raise ValueError("LaTeX formula cannot be empty")
|
||||
|
||||
|
||||
class LatexToOmmlResponse(BaseModel):
    """Payload returned by the LaTeX to OMML conversion endpoint."""

    # Falls back to an empty string when no OMML value is supplied.
    omml: str = Field(
        "",
        description="OMML (Office Math Markup Language) representation",
    )
|
||||
|
||||
|
||||
@@ -40,11 +40,10 @@ class ImageOCRRequest(BaseModel):
|
||||
class ImageOCRResponse(BaseModel):
|
||||
"""Response body for image OCR endpoint."""
|
||||
|
||||
latex: str = Field("", description="LaTeX representation of the content")
|
||||
latex: str = Field("", description="LaTeX representation of the content (empty if mixed content)")
|
||||
markdown: str = Field("", description="Markdown representation of the content")
|
||||
mathml: str = Field("", description="MathML representation (empty if no math detected)")
|
||||
mathml: str = Field("", description="Standard MathML representation (empty if mixed content)")
|
||||
mml: str = Field("", description="XML MathML with mml: namespace prefix (empty if mixed content)")
|
||||
layout_info: LayoutInfo = Field(default_factory=LayoutInfo)
|
||||
recognition_mode: str = Field(
|
||||
"", description="Recognition mode used: mixed_recognition or formula_recognition"
|
||||
)
|
||||
recognition_mode: str = Field("", description="Recognition mode used: mixed_recognition or formula_recognition")
|
||||
|
||||
|
||||
@@ -4,17 +4,29 @@ import os
|
||||
import re
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
from functools import lru_cache
|
||||
from typing import Literal
|
||||
|
||||
import pypandoc
|
||||
from latex2mathml.converter import convert as latex_to_mathml
|
||||
|
||||
|
||||
@dataclass
|
||||
class ConvertResult:
|
||||
"""Result of markdown conversion."""
|
||||
"""Result of markdown conversion.
|
||||
|
||||
Only populated when input contains pure LaTeX formula.
|
||||
All fields are empty strings when input contains mixed content (text + formula).
|
||||
|
||||
Attributes:
|
||||
latex: Pure LaTeX formula code (without delimiters).
|
||||
mathml: Standard MathML format.
|
||||
mml: XML MathML with mml: namespace prefix (mml:math).
|
||||
"""
|
||||
|
||||
latex: str
|
||||
mathml: str
|
||||
mml: str
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -28,59 +40,430 @@ class ExportResult:
|
||||
|
||||
ExportType = Literal["docx", "pdf"]
|
||||
|
||||
# MathML namespace
|
||||
MATHML_NAMESPACE = "http://www.w3.org/1998/Math/MathML"
|
||||
OMML_NAMESPACE = "http://schemas.openxmlformats.org/officeDocument/2006/math"
|
||||
|
||||
# XSLT for MathML to mml: namespace conversion
|
||||
MML_XSLT = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet version="1.0"
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:mml="http://www.w3.org/1998/Math/MathML"
|
||||
xmlns:m="http://www.w3.org/1998/Math/MathML"
|
||||
exclude-result-prefixes="m">
|
||||
|
||||
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
|
||||
|
||||
<!-- Match root math element -->
|
||||
<xsl:template match="m:math|math">
|
||||
<mml:math>
|
||||
<xsl:apply-templates select="@*|node()"/>
|
||||
</mml:math>
|
||||
</xsl:template>
|
||||
|
||||
<!-- Match all other MathML elements -->
|
||||
<xsl:template match="m:*|mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|maction|semantics|annotation|annotation-xml">
|
||||
<xsl:element name="mml:{local-name()}">
|
||||
<xsl:apply-templates select="@*|node()"/>
|
||||
</xsl:element>
|
||||
</xsl:template>
|
||||
|
||||
<!-- Copy attributes -->
|
||||
<xsl:template match="@*">
|
||||
<xsl:if test="local-name() != 'xmlns'">
|
||||
<xsl:copy/>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
|
||||
<!-- Copy text nodes -->
|
||||
<xsl:template match="text()">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
"""
|
||||
|
||||
|
||||
class Converter:
|
||||
"""Service for conversion and export operations."""
|
||||
"""Service for conversion and export operations.
|
||||
|
||||
Conversion rules:
|
||||
- Only pure LaTeX formulas can be converted to latex/mathml/mml formats.
|
||||
- Mixed content (text + formula) returns empty results for all formats.
|
||||
- OMML conversion is provided as a separate method due to performance overhead.
|
||||
|
||||
Performance optimizations:
|
||||
- Pre-compiled regex patterns
|
||||
- XSLT-based MML conversion
|
||||
- Cached XSLT transforms
|
||||
- Direct Pandoc OMML output (avoids DOCX parsing)
|
||||
"""
|
||||
|
||||
# Pandoc input format with LaTeX math extensions
|
||||
INPUT_FORMAT = "markdown+raw_tex+tex_math_dollars+tex_math_double_backslash"
|
||||
|
||||
# Pre-compiled regex patterns for formula detection
|
||||
_RE_DISPLAY_DOLLAR = re.compile(r"\$\$[\s\S]+\$\$")
|
||||
_RE_DISPLAY_BRACKET = re.compile(r"\\\[[\s\S]+\\\]")
|
||||
_RE_INLINE_DOLLAR = re.compile(r"\$(?!\$)[^\$]+\$(?!\$)")
|
||||
_RE_INLINE_PAREN = re.compile(r"\\\([\s\S]+\\\)")
|
||||
_RE_MATH_ELEMENT = re.compile(r"<math[^>]*>[\s\S]*?</math>")
|
||||
|
||||
# Pre-compiled regex patterns for preprocessing
|
||||
_RE_VSPACE = re.compile(r"\\\[1mm\]")
|
||||
_RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
|
||||
_RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
|
||||
_RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
|
||||
_RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
|
||||
_RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
|
||||
_RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
|
||||
_RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
|
||||
_RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
|
||||
_RE_ALIGNED_BRACE = re.compile(r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL)
|
||||
_RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
|
||||
_RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
|
||||
_RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
|
||||
_RE_VMATRIX_DOUBLE = re.compile(r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}", re.DOTALL)
|
||||
|
||||
# Cached XSLT transform
|
||||
_mml_xslt_transform = None
|
||||
|
||||
def __init__(self):
|
||||
"""Initialize converter."""
|
||||
|
||||
@classmethod
def _get_mml_xslt_transform(cls):
    """Return the XSLT transform that rewrites MathML with the mml: prefix.

    The transform is compiled from ``MML_XSLT`` on first use and stored on
    the class, so later calls return the stored object.
    """
    if cls._mml_xslt_transform is not None:
        return cls._mml_xslt_transform

    # lxml is imported lazily, only when the transform has to be built.
    from lxml import etree

    stylesheet = etree.fromstring(MML_XSLT.encode("utf-8"))
    cls._mml_xslt_transform = etree.XSLT(stylesheet)
    return cls._mml_xslt_transform
|
||||
|
||||
def _is_formula_only(self, text: str) -> bool:
|
||||
"""Check if text contains only a LaTeX formula (no mixed content).
|
||||
|
||||
A text is considered formula-only if it matches one of these patterns:
|
||||
- Display math: $$...$$ or \\[...\\]
|
||||
- Inline math: $...$ or \\(...\\)
|
||||
|
||||
Args:
|
||||
text: Input text to check.
|
||||
|
||||
Returns:
|
||||
True if the text contains only a LaTeX formula, False otherwise.
|
||||
"""
|
||||
text = text.strip()
|
||||
|
||||
if not text:
|
||||
return False
|
||||
|
||||
# Strict patterns: entire text must be a single formula with delimiters
|
||||
# Using pre-compiled patterns with fullmatch semantics
|
||||
if self._RE_DISPLAY_DOLLAR.fullmatch(text):
|
||||
return True
|
||||
if self._RE_DISPLAY_BRACKET.fullmatch(text):
|
||||
return True
|
||||
if self._RE_INLINE_DOLLAR.fullmatch(text):
|
||||
return True
|
||||
if self._RE_INLINE_PAREN.fullmatch(text):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def convert_to_formats(self, md_text: str) -> ConvertResult:
|
||||
"""Convert markdown to LaTeX and MathML formats.
|
||||
"""Convert markdown to LaTeX, MathML, and MML formats.
|
||||
|
||||
Only converts when input contains a pure LaTeX formula.
|
||||
Mixed content (text + formula) returns empty strings for all fields.
|
||||
|
||||
Args:
|
||||
md_text: Markdown text to convert.
|
||||
|
||||
Returns:
|
||||
ConvertResult with latex and mathml fields.
|
||||
ConvertResult with latex, mathml, and mml fields.
|
||||
All fields are empty if input is not a pure formula.
|
||||
|
||||
Raises:
|
||||
ValueError: If md_text is empty.
|
||||
RuntimeError: If conversion fails.
|
||||
RuntimeError: If conversion fails for a valid formula.
|
||||
"""
|
||||
if md_text == "":
|
||||
return ConvertResult(latex="", mathml="")
|
||||
# Empty input returns empty result
|
||||
if not md_text or not md_text.strip():
|
||||
return ConvertResult(latex="", mathml="", mml="")
|
||||
|
||||
# Check if input is formula-only
|
||||
if not self._is_formula_only(md_text):
|
||||
# Mixed content: cannot convert to formula formats
|
||||
return ConvertResult(latex="", mathml="", mml="")
|
||||
|
||||
try:
|
||||
# Convert to LaTeX
|
||||
latex_output = pypandoc.convert_text(
|
||||
md_text,
|
||||
"latex",
|
||||
format=self.INPUT_FORMAT,
|
||||
).rstrip("\n")
|
||||
# Extract the LaTeX formula content (remove delimiters)
|
||||
latex_formula = self._extract_latex_formula(md_text)
|
||||
|
||||
# Convert to HTML with MathML
|
||||
mathml_output = pypandoc.convert_text(
|
||||
md_text,
|
||||
"html",
|
||||
format=self.INPUT_FORMAT,
|
||||
extra_args=["--mathml"],
|
||||
).rstrip("\n")
|
||||
# Convert to MathML
|
||||
mathml = self._latex_to_mathml(latex_formula)
|
||||
|
||||
return ConvertResult(latex=latex_output, mathml=mathml_output)
|
||||
# Convert MathML to mml:math format (with namespace prefix)
|
||||
mml = self._mathml_to_mml(mathml)
|
||||
|
||||
return ConvertResult(latex=latex_formula, mathml=mathml, mml=mml)
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Conversion failed: {e}") from e
|
||||
|
||||
def convert_to_omml(self, latex_formula: str) -> str:
    """Turn a bare LaTeX formula into OMML (Office Math Markup Language).

    The formula is stripped, passed through ``_preprocess_formula_for_omml``
    and then handed to ``_latex_to_omml``. It is kept as its own entry point
    because OMML generation is comparatively expensive.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters like $ or $$).

    Returns:
        OMML representation as XML string.

    Raises:
        ValueError: If latex_formula is empty or whitespace only.
        RuntimeError: If conversion fails.
    """
    stripped = latex_formula.strip() if latex_formula else ""
    if not stripped:
        raise ValueError("LaTeX formula cannot be empty")

    return self._latex_to_omml(self._preprocess_formula_for_omml(stripped))
|
||||
|
||||
def _preprocess_formula_for_omml(self, latex_formula: str) -> str:
|
||||
"""Preprocess LaTeX formula for OMML conversion.
|
||||
|
||||
Applies the same preprocessing steps as preprocess_for_export to ensure
|
||||
consistency. This fixes common issues that cause Pandoc OMML conversion to fail.
|
||||
|
||||
Args:
|
||||
latex_formula: Pure LaTeX formula.
|
||||
|
||||
Returns:
|
||||
Preprocessed LaTeX formula.
|
||||
"""
|
||||
# Use the same preprocessing methods as export
|
||||
# 1. Convert matrix environments
|
||||
latex_formula = self._convert_matrix_environments(latex_formula)
|
||||
|
||||
# 2. Fix array column specifiers (remove spaces)
|
||||
latex_formula = self._fix_array_column_specifiers(latex_formula)
|
||||
|
||||
# 3. Fix brace spacing
|
||||
latex_formula = self._fix_brace_spacing(latex_formula)
|
||||
|
||||
# 4. Convert special environments (cases, aligned)
|
||||
latex_formula = self._convert_special_environments(latex_formula)
|
||||
|
||||
return latex_formula
|
||||
|
||||
def _extract_latex_formula(self, text: str) -> str:
|
||||
"""Extract LaTeX formula from text by removing delimiters.
|
||||
|
||||
Args:
|
||||
text: Text containing LaTeX formula with delimiters.
|
||||
|
||||
Returns:
|
||||
Pure LaTeX formula without delimiters.
|
||||
"""
|
||||
text = text.strip()
|
||||
|
||||
# Remove display math delimiters: $$...$$ or \[...\]
|
||||
if text.startswith("$$") and text.endswith("$$"):
|
||||
return text[2:-2].strip()
|
||||
if text.startswith("\\[") and text.endswith("\\]"):
|
||||
return text[2:-2].strip()
|
||||
|
||||
# Remove inline math delimiters: $...$ or \(...\)
|
||||
if text.startswith("$") and text.endswith("$") and not text.startswith("$$"):
|
||||
return text[1:-1].strip()
|
||||
if text.startswith("\\(") and text.endswith("\\)"):
|
||||
return text[2:-2].strip()
|
||||
|
||||
# If no delimiters, return as-is
|
||||
return text.strip()
|
||||
|
||||
@staticmethod
@lru_cache(maxsize=256)
def _latex_to_mathml_cached(latex_formula: str) -> str:
    """Convert a LaTeX formula to MathML, memoising results per formula.

    ``latex_to_mathml`` is tried first. If it raises, the formula is wrapped
    in ``$...$`` and converted through Pandoc's HTML writer with ``--mathml``;
    the first ``<math>`` element of that output is returned, or the whole
    output without trailing newlines when no such element is found.

    Raises:
        RuntimeError: If both the primary conversion and the Pandoc fallback fail.
    """
    try:
        return latex_to_mathml(latex_formula)
    except Exception as primary_error:
        try:
            html = pypandoc.convert_text(
                f"${latex_formula}$",
                "html",
                format="markdown+tex_math_dollars",
                extra_args=["--mathml"],
            )
            found = Converter._RE_MATH_ELEMENT.search(html)
            return found.group(0) if found else html.rstrip("\n")
        except Exception as pandoc_error:
            raise RuntimeError(
                f"MathML conversion failed: {primary_error}. Pandoc fallback also failed: {pandoc_error}"
            ) from primary_error
|
||||
|
||||
def _latex_to_mathml(self, latex_formula: str) -> str:
|
||||
"""Convert LaTeX formula to standard MathML.
|
||||
|
||||
Args:
|
||||
latex_formula: Pure LaTeX formula (without delimiters).
|
||||
|
||||
Returns:
|
||||
Standard MathML representation.
|
||||
"""
|
||||
return self._latex_to_mathml_cached(latex_formula)
|
||||
|
||||
def _mathml_to_mml(self, mathml: str) -> str:
|
||||
"""Convert standard MathML to mml:math format with namespace prefix.
|
||||
|
||||
Uses XSLT for efficient transformation. Transforms:
|
||||
- <math ...> to <mml:math xmlns:mml="..." ...>
|
||||
- All child elements like <mi>, <mo> to <mml:mi>, <mml:mo>
|
||||
|
||||
Args:
|
||||
mathml: Standard MathML string.
|
||||
|
||||
Returns:
|
||||
MathML with mml: namespace prefix.
|
||||
"""
|
||||
if not mathml:
|
||||
return ""
|
||||
|
||||
try:
|
||||
from lxml import etree
|
||||
|
||||
# Parse MathML
|
||||
root = etree.fromstring(mathml.encode("utf-8"))
|
||||
|
||||
# Apply XSLT transformation (cached)
|
||||
transform = self._get_mml_xslt_transform()
|
||||
result_tree = transform(root)
|
||||
|
||||
# Serialize to string
|
||||
return str(result_tree)
|
||||
|
||||
except Exception:
|
||||
# Fallback: simple string replacement (less robust but no lxml dependency)
|
||||
result = mathml
|
||||
# Add namespace to root math element
|
||||
result = re.sub(
|
||||
r"<math\b",
|
||||
f'<mml:math xmlns:mml="{MATHML_NAMESPACE}"',
|
||||
result,
|
||||
)
|
||||
result = re.sub(r"</math>", "</mml:math>", result)
|
||||
|
||||
# Add mml: prefix to all other elements using a single regex
|
||||
# Match opening tags
|
||||
result = re.sub(
|
||||
r"<(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
|
||||
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
|
||||
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
|
||||
r"maction|semantics|annotation|annotation-xml)\b",
|
||||
r"<mml:\1",
|
||||
result,
|
||||
)
|
||||
# Match closing tags
|
||||
result = re.sub(
|
||||
r"</(mi|mo|mn|ms|mtext|mspace|mrow|mfrac|msqrt|mroot|mstyle|merror|"
|
||||
r"mpadded|mphantom|mfenced|menclose|msub|msup|msubsup|munder|mover|"
|
||||
r"munderover|mmultiscripts|mtable|mtr|mtd|maligngroup|malignmark|"
|
||||
r"maction|semantics|annotation|annotation-xml)>",
|
||||
r"</mml:\1>",
|
||||
result,
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
def _latex_to_omml(self, latex_formula: str) -> str:
    """Convert LaTeX formula to OMML (Office Math Markup Language).

    The formula is written as a ``$$...$$`` block to a Markdown file inside a
    private temporary directory, converted to DOCX with Pandoc, and the
    ``oMath`` elements are then extracted from ``word/document.xml``.

    The temporary directory (and both files in it) is removed when the
    conversion finishes, whether it succeeds or fails.

    Args:
        latex_formula: Pure LaTeX formula (without delimiters).

    Returns:
        OMML representation as XML string: every ``oMath`` element found in
        the document, joined by newlines (empty string if there is none).

    Raises:
        RuntimeError: If any step of the conversion fails.
    """
    import zipfile

    try:
        from lxml import etree

        with tempfile.TemporaryDirectory() as workdir:
            # Build both paths explicitly instead of deriving the DOCX path
            # with str.replace(".md", ".docx"), which would also rewrite any
            # other ".md" occurring earlier in the temp path.
            md_path = os.path.join(workdir, "formula.md")
            docx_path = os.path.join(workdir, "formula.docx")

            # Pandoc reads its input as UTF-8, so do not rely on the locale's
            # default encoding when writing the formula.
            with open(md_path, "w", encoding="utf-8") as md_file:
                md_file.write(f"$${latex_formula}$$\n")

            pypandoc.convert_file(
                md_path,
                "docx",
                format=self.INPUT_FORMAT,
                outputfile=docx_path,
            )

            # A DOCX file is a ZIP archive; the body lives in word/document.xml.
            with zipfile.ZipFile(docx_path, "r") as archive:
                document_xml = archive.read("word/document.xml")

        root = etree.fromstring(document_xml)
        omml_parts = [
            etree.tostring(math, encoding="unicode")
            for math in root.findall(f".//{{{OMML_NAMESPACE}}}oMath")
        ]
        return "\n".join(omml_parts)

    except Exception as e:
        raise RuntimeError(f"OMML conversion failed: {e}") from e
|
||||
|
||||
def preprocess_for_export(self, md_text: str) -> str:
|
||||
"""Preprocess markdown text for export to docx/pdf.
|
||||
|
||||
Handles LaTeX formula formatting, matrix environments, and
|
||||
other transformations needed for proper Word/PDF rendering.
|
||||
|
||||
Uses pre-compiled regex patterns for better performance.
|
||||
|
||||
Args:
|
||||
md_text: Raw markdown text.
|
||||
|
||||
@@ -88,36 +471,23 @@ class Converter:
|
||||
Preprocessed markdown text.
|
||||
"""
|
||||
# Replace \[1mm] => \vspace{1mm}
|
||||
md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)
|
||||
md_text = self._RE_VSPACE.sub(r"\\vspace{1mm}", md_text)
|
||||
|
||||
# Add blank lines around \[...\] block formulas
|
||||
md_text = re.sub(
|
||||
r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
|
||||
r"\1\n\n\\[\3\\]\n\n\4",
|
||||
md_text,
|
||||
flags=re.DOTALL,
|
||||
)
|
||||
md_text = re.sub(
|
||||
r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
|
||||
r"\n\\[\2\\]\n",
|
||||
md_text,
|
||||
flags=re.MULTILINE | re.DOTALL,
|
||||
)
|
||||
md_text = self._RE_BLOCK_FORMULA_INLINE.sub(r"\1\n\n\\[\3\\]\n\n\4", md_text)
|
||||
md_text = self._RE_BLOCK_FORMULA_LINE.sub(r"\n\\[\2\\]\n", md_text)
|
||||
|
||||
# Remove arithmatex span wrappers
|
||||
cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)
|
||||
cleaned_md = self._RE_ARITHMATEX.sub(r"\1", md_text)
|
||||
|
||||
# Convert inline formulas: \( \) => $ $
|
||||
cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
|
||||
cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
|
||||
cleaned_md = cleaned_md.replace("\\(", "$").replace("\\)", "$")
|
||||
|
||||
# Convert block formulas: \[ \] => $$ $$
|
||||
cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
|
||||
cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
|
||||
cleaned_md = cleaned_md.replace("\\[", "$$").replace("\\]", "$$")
|
||||
|
||||
# Remove spaces between $ and formula content
|
||||
# Use negative lookahead/lookbehind to avoid matching $$ block formulas
|
||||
cleaned_md = re.sub(r"(?<!\$)\$ +(.+?) +\$(?!\$)", r"$\1$", cleaned_md)
|
||||
cleaned_md = self._RE_INLINE_SPACE.sub(r"$\1$", cleaned_md)
|
||||
|
||||
# Convert matrix environments for better Word rendering
|
||||
cleaned_md = self._convert_matrix_environments(cleaned_md)
|
||||
@@ -142,19 +512,15 @@ class Converter:
|
||||
This fixes the vertical line height issues in Word.
|
||||
"""
|
||||
# vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
|
||||
md_text = re.sub(
|
||||
r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
|
||||
md_text = self._RE_VMATRIX.sub(
|
||||
r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
|
||||
md_text,
|
||||
flags=re.DOTALL,
|
||||
)
|
||||
|
||||
# Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
|
||||
md_text = re.sub(
|
||||
r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
|
||||
md_text = self._RE_VMATRIX_DOUBLE.sub(
|
||||
r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
|
||||
md_text,
|
||||
flags=re.DOTALL,
|
||||
)
|
||||
|
||||
return md_text
|
||||
@@ -165,50 +531,22 @@ class Converter:
|
||||
Pandoc's OMML converter doesn't accept spaces between column alignment
|
||||
specifiers in array environments. This converts patterns like
|
||||
{c c c c} to {cccc}.
|
||||
|
||||
Args:
|
||||
md_text: Markdown text with LaTeX formulas.
|
||||
|
||||
Returns:
|
||||
Markdown text with fixed array column specifiers.
|
||||
"""
|
||||
|
||||
def remove_spaces_in_specifier(match: re.Match) -> str:
|
||||
"""Remove spaces from column specifier."""
|
||||
specifier = match.group(1)
|
||||
# Remove all spaces from the specifier
|
||||
specifier_no_spaces = re.sub(r"\s+", "", specifier)
|
||||
return f"\\begin{{array}}{{{specifier_no_spaces}}}"
|
||||
return f"\\begin{{array}}{{{specifier.replace(' ', '')}}}"
|
||||
|
||||
# Match \begin{array}{...} and remove spaces in the column specifier
|
||||
# Pattern: \begin{array}{c c c ...} -> \begin{array}{ccc...}
|
||||
md_text = re.sub(
|
||||
r"\\begin\{array\}\{([^}]+)\}",
|
||||
remove_spaces_in_specifier,
|
||||
md_text,
|
||||
)
|
||||
|
||||
return md_text
|
||||
return self._RE_ARRAY_SPECIFIER.sub(remove_spaces_in_specifier, md_text)
|
||||
|
||||
def _fix_brace_spacing(self, md_text: str) -> str:
|
||||
"""Fix spacing issues with braces in equation systems.
|
||||
|
||||
Removes whitespace and adds negative space for proper alignment in Word/OMML.
|
||||
"""
|
||||
# Fix \left\{ spacing
|
||||
md_text = re.sub(
|
||||
r"\\left\\\{\s+",
|
||||
r"\\left\\{\\!",
|
||||
md_text,
|
||||
)
|
||||
|
||||
# Fix \right\} spacing
|
||||
md_text = re.sub(
|
||||
r"\s+\\right\\\}",
|
||||
r"\\!\\right\\}",
|
||||
md_text,
|
||||
)
|
||||
|
||||
md_text = self._RE_LEFT_BRACE.sub(r"\\left\\{\\!", md_text)
|
||||
md_text = self._RE_RIGHT_BRACE.sub(r"\\!\\right\\}", md_text)
|
||||
return md_text
|
||||
|
||||
def _convert_special_environments(self, md_text: str) -> str:
|
||||
@@ -216,42 +554,28 @@ class Converter:
|
||||
|
||||
These environments have better rendering support in Word/OMML.
|
||||
"""
|
||||
# Pre-compiled pattern for alignment marker removal
|
||||
_re_align_marker = re.compile(r"(^|\\\\)\s*&")
|
||||
|
||||
def convert_cases(match: re.Match) -> str:
|
||||
content = match.group(1)
|
||||
return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."
|
||||
|
||||
md_text = re.sub(
|
||||
r"\\begin\{cases\}(.*?)\\end\{cases\}",
|
||||
convert_cases,
|
||||
md_text,
|
||||
flags=re.DOTALL,
|
||||
)
|
||||
md_text = self._RE_CASES.sub(convert_cases, md_text)
|
||||
|
||||
def convert_aligned_to_array(match: re.Match) -> str:
|
||||
content = match.group(1)
|
||||
# Remove leading & alignment markers (not needed in array{l})
|
||||
content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
|
||||
content = _re_align_marker.sub(r"\1", content)
|
||||
return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."
|
||||
|
||||
md_text = re.sub(
|
||||
r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.",
|
||||
convert_aligned_to_array,
|
||||
md_text,
|
||||
flags=re.DOTALL,
|
||||
)
|
||||
md_text = self._RE_ALIGNED_BRACE.sub(convert_aligned_to_array, md_text)
|
||||
|
||||
def convert_standalone_aligned(match: re.Match) -> str:
|
||||
content = match.group(1)
|
||||
content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
|
||||
content = _re_align_marker.sub(r"\1", content)
|
||||
return r"\begin{array}{l}" + content + r"\end{array}"
|
||||
|
||||
md_text = re.sub(
|
||||
r"\\begin\{aligned\}(.*?)\\end\{aligned\}",
|
||||
convert_standalone_aligned,
|
||||
md_text,
|
||||
flags=re.DOTALL,
|
||||
)
|
||||
md_text = self._RE_ALIGNED.sub(convert_standalone_aligned, md_text)
|
||||
|
||||
return md_text
|
||||
|
||||
@@ -259,36 +583,15 @@ class Converter:
|
||||
"""Convert LaTeX \\tag{} commands to Word-compatible format.
|
||||
|
||||
The \\tag{} command is not supported in Word OMML format, so we convert it to
|
||||
use simple spacing (\quad) to push the equation number to the right side.
|
||||
The tag remains inside the formula for better compatibility.
|
||||
|
||||
Args:
|
||||
md_text: Markdown text containing LaTeX formulas with \\tag{}.
|
||||
|
||||
Returns:
|
||||
Markdown text with \\tag{} commands converted to spacing format.
|
||||
use simple spacing (\\quad) to push the equation number to the right side.
|
||||
"""
|
||||
|
||||
def convert_tag(match: re.Match) -> str:
|
||||
"""Convert a single \\tag{} command within a formula."""
|
||||
formula_content = match.group(1)
|
||||
tag_content = match.group(2)
|
||||
|
||||
# Replace \tag{...} with \quad (...) to push the number to the right
|
||||
# Keep it inside the formula for better Word compatibility
|
||||
return f"$${formula_content} \\quad ({tag_content})$$"
|
||||
|
||||
# Match display formulas ($$...$$) containing \\tag{...}
|
||||
# Pattern: $$...content...\\tag {?...}...$$
|
||||
# Allow optional space between \tag and {
|
||||
md_text = re.sub(
|
||||
r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$",
|
||||
convert_tag,
|
||||
md_text,
|
||||
flags=re.DOTALL,
|
||||
)
|
||||
|
||||
return md_text
|
||||
return self._RE_TAG.sub(convert_tag, md_text)
|
||||
|
||||
def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
|
||||
"""Export markdown to docx or pdf file.
|
||||
@@ -381,4 +684,3 @@ class Converter:
|
||||
"""
|
||||
if os.path.exists(file_path):
|
||||
os.remove(file_path)
|
||||
|
||||
|
||||
@@ -17,13 +17,31 @@ settings = get_settings()
|
||||
|
||||
_COMMANDS_NEED_SPACE = {
|
||||
# operators / calculus
|
||||
"cdot", "times", "div", "pm", "mp",
|
||||
"int", "iint", "iiint", "oint", "sum", "prod", "lim",
|
||||
"cdot",
|
||||
"times",
|
||||
"div",
|
||||
"pm",
|
||||
"mp",
|
||||
"int",
|
||||
"iint",
|
||||
"iiint",
|
||||
"oint",
|
||||
"sum",
|
||||
"prod",
|
||||
"lim",
|
||||
# common functions
|
||||
"sin", "cos", "tan", "cot", "sec", "csc",
|
||||
"log", "ln", "exp",
|
||||
"sin",
|
||||
"cos",
|
||||
"tan",
|
||||
"cot",
|
||||
"sec",
|
||||
"csc",
|
||||
"log",
|
||||
"ln",
|
||||
"exp",
|
||||
# misc
|
||||
"partial", "nabla",
|
||||
"partial",
|
||||
"nabla",
|
||||
}
|
||||
|
||||
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
||||
@@ -58,7 +76,7 @@ def _split_glued_command_token(token: str) -> str:
|
||||
if not best:
|
||||
return token
|
||||
|
||||
suffix = body[len(best):]
|
||||
suffix = body[len(best) :]
|
||||
if not suffix:
|
||||
return token
|
||||
|
||||
@@ -165,6 +183,7 @@ class OCRService(OCRServiceBase):
|
||||
"markdown": markdown_content,
|
||||
"latex": convert_result.latex,
|
||||
"mathml": convert_result.mathml,
|
||||
"mml": convert_result.mml,
|
||||
}
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Mixed recognition failed: {e}") from e
|
||||
@@ -196,6 +215,7 @@ class OCRService(OCRServiceBase):
|
||||
return {
|
||||
"latex": convert_result.latex,
|
||||
"mathml": convert_result.mathml,
|
||||
"mml": convert_result.mml,
|
||||
"markdown": markdown_content,
|
||||
}
|
||||
except Exception as e:
|
||||
@@ -251,65 +271,60 @@ class MineruOCRService(OCRServiceBase):
|
||||
image = self.image_processor.add_padding(image)
|
||||
|
||||
# Convert numpy array to image bytes
|
||||
success, encoded_image = cv2.imencode('.png', image)
|
||||
success, encoded_image = cv2.imencode(".png", image)
|
||||
if not success:
|
||||
raise RuntimeError("Failed to encode image")
|
||||
|
||||
image_bytes = BytesIO(encoded_image.tobytes())
|
||||
|
||||
# Prepare multipart form data
|
||||
files = {
|
||||
'files': ('image.png', image_bytes, 'image/png')
|
||||
}
|
||||
files = {"files": ("image.png", image_bytes, "image/png")}
|
||||
|
||||
data = {
|
||||
'return_middle_json': 'false',
|
||||
'return_model_output': 'false',
|
||||
'return_md': 'true',
|
||||
'return_images': 'false',
|
||||
'end_page_id': '99999',
|
||||
'start_page_id': '0',
|
||||
'lang_list': 'en',
|
||||
'server_url': 'string',
|
||||
'return_content_list': 'false',
|
||||
'backend': 'hybrid-auto-engine',
|
||||
'table_enable': 'true',
|
||||
'response_format_zip': 'false',
|
||||
'formula_enable': 'true',
|
||||
'parse_method': 'ocr'
|
||||
"return_middle_json": "false",
|
||||
"return_model_output": "false",
|
||||
"return_md": "true",
|
||||
"return_images": "false",
|
||||
"end_page_id": "99999",
|
||||
"start_page_id": "0",
|
||||
"lang_list": "en",
|
||||
"server_url": "string",
|
||||
"return_content_list": "false",
|
||||
"backend": "hybrid-auto-engine",
|
||||
"table_enable": "true",
|
||||
"response_format_zip": "false",
|
||||
"formula_enable": "true",
|
||||
"parse_method": "ocr",
|
||||
}
|
||||
|
||||
# Make API request
|
||||
response = requests.post(
|
||||
self.api_url,
|
||||
files=files,
|
||||
data=data,
|
||||
headers={'accept': 'application/json'},
|
||||
timeout=30
|
||||
)
|
||||
response = requests.post(self.api_url, files=files, data=data, headers={"accept": "application/json"}, timeout=30)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
|
||||
# Extract markdown content from response
|
||||
markdown_content = ""
|
||||
if 'results' in result and 'image' in result['results']:
|
||||
markdown_content = result['results']['image'].get('md_content', '')
|
||||
if "results" in result and "image" in result["results"]:
|
||||
markdown_content = result["results"]["image"].get("md_content", "")
|
||||
|
||||
# markdown_content = _postprocess_markdown(markdown_content)
|
||||
|
||||
# Convert to other formats if converter is available
|
||||
latex = ""
|
||||
mathml = ""
|
||||
mml = ""
|
||||
if self.converter and markdown_content:
|
||||
convert_result = self.converter.convert_to_formats(markdown_content)
|
||||
latex = convert_result.latex
|
||||
mathml = convert_result.mathml
|
||||
mml = convert_result.mml
|
||||
|
||||
return {
|
||||
"markdown": markdown_content,
|
||||
"latex": latex,
|
||||
"mathml": mathml,
|
||||
"mml": mml,
|
||||
}
|
||||
|
||||
except requests.RequestException as e:
|
||||
@@ -318,8 +333,6 @@ class MineruOCRService(OCRServiceBase):
|
||||
raise RuntimeError(f"Recognition failed: {e}") from e
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
mineru_service = MineruOCRService()
|
||||
image = cv2.imread("test/complex_formula.png")
|
||||
|
||||
@@ -26,7 +26,8 @@ dependencies = [
|
||||
"pypandoc==1.16.2",
|
||||
"paddlepaddle",
|
||||
"paddleocr[doc-parser]",
|
||||
"safetensors"
|
||||
"safetensors",
|
||||
"lxml>=5.0.0"
|
||||
]
|
||||
|
||||
[tool.uv.sources]
|
||||
|
||||
102
test_array_fix.py
Normal file
102
test_array_fix.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""Test script for array column specifier fix."""
|
||||
|
||||
from app.services.converter import Converter
|
||||
|
||||
|
||||
def test_array_specifier_fix():
    """Check that spaced array column specifiers are normalized and convertible.

    Two steps are run against the LaTeX that originally triggered the error:

    1. ``Converter._preprocess_formula_for_omml`` must rewrite ``{c c c c}``
       to ``{cccc}``.
    2. ``Converter.convert_to_omml`` must complete without raising.

    Returns:
        True only when both steps succeed. A failed (or inconclusive)
        preprocessing check now makes the function return False instead of
        being reported as a pass.
    """
    converter = Converter()

    # The problematic LaTeX from the error
    latex_formula = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""

    print("Testing array column specifier fix")
    print("=" * 80)
    print(f"\nOriginal LaTeX (first 200 chars):\n{latex_formula[:200]}...")

    # Test preprocessing
    print("\n" + "-" * 80)
    print("Step 1: Preprocessing")
    preprocessed = converter._preprocess_formula_for_omml(latex_formula)

    # Check if spaces were removed from array specifiers
    spaced_spec = "{c c c c}"
    if spaced_spec in preprocessed:
        print("✗ FAILED: Spaces not removed from array specifiers")
        start = preprocessed.find(spaced_spec)
        print(f"Found: {preprocessed[start:start + 10]}")
        preprocessing_ok = False
    elif "{cccc}" in preprocessed:
        print("✓ SUCCESS: Spaces removed from array specifiers")
        # Plain literal on purpose: the previous f-string put a backslash
        # inside a replacement field, which is a SyntaxError before Python 3.12.
        print("Changed '{c c c c}' → '{cccc}'")
        preprocessing_ok = True
    else:
        # The input contains the spaced specifier, so finding neither form
        # means the preprocessing did something unexpected.
        print("? Could not find array specifier in preprocessed output")
        preprocessing_ok = False

    # Test OMML conversion
    print("\n" + "-" * 80)
    print("Step 2: OMML Conversion")
    try:
        omml = converter.convert_to_omml(latex_formula)
        print("✓ SUCCESS: OMML conversion completed")
        print(f"OMML length: {len(omml)} characters")
        print(f"OMML preview (first 300 chars):\n{omml[:300]}...")

        # Check if it contains oMath element (informational only)
        if "oMath" in omml:
            print("\n✓ Valid OMML: Contains oMath element")
        else:
            print("\n✗ WARNING: OMML might be incomplete (no oMath element found)")

    except Exception as e:
        print("✗ FAILED: OMML conversion error")
        print(f"Error: {e}")
        return False

    print("\n" + "=" * 80)
    if not preprocessing_ok:
        # Do not claim success when step 1 reported a problem.
        print("✗ Preprocessing check failed")
        return False

    print("✓ All tests passed!")
    return True
|
||||
|
||||
|
||||
def test_simple_array():
    """Convert a small array whose column specifier contains spaces.

    Returns:
        True if ``Converter.convert_to_omml`` finishes without raising,
        False if it raises any exception (the error is printed).
    """
    converter = Converter()

    rule = "=" * 80
    print(f"\nTesting simple array\n{rule}")

    # Simple array with spaces in column specifier
    latex_formula = r"\begin{array}{c c c} a & b & c \\ d & e & f \end{array}"
    print(f"LaTeX: {latex_formula}")

    try:
        omml = converter.convert_to_omml(latex_formula)
        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
        print(f"Preview: {omml[:200]}...")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run both checks, print a summary, and report the
    # outcome through the process exit status so shells and CI can detect
    # a failure (previously the script always exited with status 0).
    import sys
    import traceback

    print("Array Column Specifier Fix Test Suite\n")

    all_passed = False
    try:
        # Both tests always run, even if the first one fails.
        test1 = test_simple_array()
        test2 = test_array_specifier_fix()
        all_passed = bool(test1 and test2)

        print("\n" + "=" * 80)
        if all_passed:
            print("✓✓✓ ALL TESTS PASSED ✓✓✓")
        else:
            print("✗✗✗ SOME TESTS FAILED ✗✗✗")
        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\nTests interrupted by user")
    except Exception as e:
        print(f"\n\nTest suite error: {e}")
        traceback.print_exc()

    # 0 only when every test passed; interruptions and crashes count as failure.
    sys.exit(0 if all_passed else 1)
|
||||
57
test_converter.py
Normal file
57
test_converter.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Test script for converter functionality."""
|
||||
|
||||
from app.services.converter import Converter
|
||||
|
||||
|
||||
def test_latex_only_conversion():
    """Print what ``Converter.convert_to_formats`` returns for four samples.

    The samples are a display formula, an inline formula, a longer display
    formula, and ordinary markdown containing inline math. Nothing is
    asserted; the output is meant to be inspected by eye.
    """
    converter = Converter()

    def clip(text, limit):
        """Return the first ``limit`` characters of ``text`` plus an ellipsis."""
        return f"{text[:limit]}..."

    # Test case 1: Display math with $$...$$
    display_src = "$$E = mc^2$$"
    display = converter.convert_to_formats(display_src)

    print("Test 1: Display math ($$...$$)")
    print(f"Input: {display_src}")
    print(f"LaTeX: {display.latex}")
    print(f"MathML: {clip(display.mathml, 100)}")
    print(f"MML: {clip(display.mml, 100)}")
    # OMML may be empty for this path, so fall back to a placeholder.
    omml_preview = display.omml[:100] if display.omml else "Empty"
    print(f"OMML: {omml_preview}...")
    print()

    # Test case 2: Inline math with $...$
    inline_src = "$\\frac{a}{b}$"
    inline = converter.convert_to_formats(inline_src)

    print("Test 2: Inline math ($...$)")
    print(f"Input: {inline_src}")
    print(f"LaTeX: {inline.latex}")
    print(f"MathML: {clip(inline.mathml, 100)}")
    print()

    # Test case 3: Complex formula
    complex_src = "$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$"
    complex_out = converter.convert_to_formats(complex_src)

    print("Test 3: Complex formula")
    print(f"Input: {complex_src}")
    print(f"LaTeX: {complex_out.latex}")
    print(f"MathML: {clip(complex_out.mathml, 150)}")
    print(f"OMML length: {len(complex_out.omml)}")
    print()

    # Test case 4: Regular markdown (not LaTeX-only)
    markdown_src = "# Hello\n\nThis is a test with math: $x = 2$"
    markdown_out = converter.convert_to_formats(markdown_src)

    print("Test 4: Regular markdown")
    print(f"Input: {markdown_src}")
    print(f"LaTeX: {clip(markdown_out.latex, 100)}")
    print(f"MathML: {clip(markdown_out.mathml, 100)}")
    print(f"MML: {markdown_out.mml}")
    print(f"OMML: {markdown_out.omml}")
    print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running this file directly as a script; the function only prints
    # conversion results for manual inspection.
    test_latex_only_conversion()
|
||||
112
test_omml_api.py
Normal file
112
test_omml_api.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""Test script for OMML conversion API endpoint."""
|
||||
|
||||
import requests
|
||||
import json
|
||||
|
||||
|
||||
def test_latex_to_omml():
    """POST sample formulas to /convert/latex-to-omml and print each outcome.

    For every sample the HTTP status is printed; on a 200 response the length
    and a 150-character preview of the returned ``omml`` field are shown,
    otherwise the response body is printed. Request errors are reported and
    do not stop the remaining samples.
    """
    endpoint = "http://localhost:8000/api/v1/convert/latex-to-omml"

    # (label, LaTeX source) pairs sent to the endpoint in order
    samples = [
        ("Simple fraction", "\\frac{a}{b}"),
        ("Quadratic formula", "x = \\frac{-b \\pm \\sqrt{b^2 - 4ac}}{2a}"),
        ("Integral", "\\int_0^\\infty e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}"),
        ("Matrix", "\\begin{matrix} a & b \\\\ c & d \\end{matrix}"),
    ]

    print("Testing OMML Conversion API")
    print("=" * 80)

    for number, (label, formula) in enumerate(samples, start=1):
        print(f"\nTest {number}: {label}")
        print("-" * 80)
        print(f"LaTeX: {formula}")

        try:
            response = requests.post(
                endpoint,
                json={"latex": formula},
                headers={"Content-Type": "application/json"},
                timeout=10,
            )

            if response.status_code != 200:
                print(f"✗ Status: {response.status_code}")
                print(f"Error: {response.text}")
                continue

            omml = response.json().get("omml", "")
            print(f"✓ Status: {response.status_code}")
            print(f"OMML length: {len(omml)} characters")
            print(f"OMML preview: {omml[:150]}...")

        except requests.exceptions.RequestException as exc:
            print(f"✗ Request failed: {exc}")
        except Exception as exc:
            print(f"✗ Error: {exc}")

    print("\n" + "=" * 80)
|
||||
|
||||
|
||||
def test_invalid_input():
    """Send invalid payloads to /convert/latex-to-omml and print the replies.

    Two requests are made: one with an empty ``latex`` string and one with
    the ``latex`` field missing. For each, the HTTP status code and decoded
    JSON body are printed.

    Both requests now pass ``timeout=10`` (matching ``test_latex_to_omml``);
    without a timeout ``requests`` can block indefinitely when the server is
    unreachable or stalls.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    print("\nTesting Error Handling")
    print("=" * 80)

    base_url = "http://localhost:8000/api/v1/convert/latex-to-omml"

    # Empty LaTeX
    print("\nTest: Empty LaTeX")
    response = requests.post(
        base_url,
        json={"latex": ""},
        headers={"Content-Type": "application/json"},
        timeout=10,
    )
    print(f"Status: {response.status_code}")
    print(f"Response: {response.json()}")

    # Missing LaTeX field
    print("\nTest: Missing LaTeX field")
    response = requests.post(
        base_url,
        json={},
        headers={"Content-Type": "application/json"},
        timeout=10,
    )
    print(f"Status: {response.status_code}")
    print(f"Response: {response.json()}")

    print("\n" + "=" * 80)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: print a banner, then run both API checks in order.
    banner = (
        "OMML API Test Suite",
        "Make sure the API server is running on http://localhost:8000",
    )
    for banner_line in banner:
        print(banner_line)
    print()

    try:
        for suite in (test_latex_to_omml, test_invalid_input):
            suite()
        print("\n✓ All tests completed!")
    except KeyboardInterrupt:
        print("\n\n✗ Tests interrupted by user")
    except Exception as exc:
        print(f"\n✗ Test suite failed: {exc}")
|
||||
218
test_omml_preprocessing.py
Normal file
218
test_omml_preprocessing.py
Normal file
@@ -0,0 +1,218 @@
|
||||
"""Comprehensive test for OMML conversion with preprocessing."""
|
||||
|
||||
from app.services.converter import Converter
|
||||
|
||||
|
||||
def test_case_1_array_with_spaces():
    """Convert the array formula with spaced column specifiers (original issue).

    Returns:
        True when the conversion and the follow-up preprocessing call finish
        without raising; False if either raises. The structure and
        preprocessing checks only print a confirmation line.
    """
    rule = "=" * 80
    print(f"\n{rule}\nTest 1: Array with spaces in column specifier\n{rule}")

    converter = Converter()

    # The problematic LaTeX from the error
    latex = r"""\begin{array}{l} D = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} + 0 + \dots + 0 & 0 + a _ {i 2} + \dots + 0 & \dots & 0 + \dots + 0 + a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ = \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ a _ {i 1} & 0 & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & a _ {i 2} & \dots & 0 \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right| \\ + \dots + \left| \begin{array}{c c c c} a _ {1 1} & a _ {1 2} & \dots & a _ {1 n} \\ \vdots & \vdots & & \vdots \\ 0 & 0 & \dots & a _ {i n} \\ \vdots & \vdots & & \vdots \\ a _ {n 1} & a _ {n 2} & \dots & a _ {n n} \end{array} \right|, \\ \end{array}"""

    print(f"LaTeX length: {len(latex)} chars\nPreview: {latex[:100]}...")

    try:
        omml = converter.convert_to_omml(latex)
        print("\n✓ SUCCESS: Converted to OMML")
        print(f"OMML length: {len(omml)} chars")

        if "oMath" in omml:
            print("✓ Valid OMML structure detected")

        # Check preprocessing worked
        cleaned = converter._preprocess_formula_for_omml(latex)
        spaces_gone = "{c c c c}" not in cleaned
        if spaces_gone and "{cccc}" in cleaned:
            print("✓ Array column specifiers fixed: '{c c c c}' → '{cccc}'")
    except Exception as exc:
        print(f"\n✗ FAILED: {exc}")
        return False
    return True
|
||||
|
||||
|
||||
def test_case_2_vmatrix():
    """Convert a ``vmatrix`` environment and report the outcome.

    Returns:
        True when conversion and the preprocessing call finish without
        raising; False otherwise. The vmatrix rewrite check is informational.
    """
    rule = "=" * 80
    print(f"\n{rule}\nTest 2: vmatrix environment\n{rule}")

    converter = Converter()

    latex = r"\begin{vmatrix} a & b \\ c & d \end{vmatrix}"
    print(f"LaTeX: {latex}")

    try:
        omml = converter.convert_to_omml(latex)
        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")

        # Check if vmatrix was converted
        cleaned = converter._preprocess_formula_for_omml(latex)
        uses_bars = r"\left|" in cleaned
        if "vmatrix" not in cleaned and uses_bars:
            print("✓ vmatrix converted to \\left| ... \\right|")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
|
||||
|
||||
|
||||
def test_case_3_cases_environment():
    """Convert a ``cases`` environment and report the outcome.

    Returns:
        True when conversion and the preprocessing call finish without
        raising; False otherwise. The cases-to-array check is informational.
    """
    rule = "=" * 80
    print(f"\n{rule}\nTest 3: cases environment\n{rule}")

    converter = Converter()

    latex = r"f(x) = \begin{cases} x^2 & x \geq 0 \\ -x & x < 0 \end{cases}"
    print(f"LaTeX: {latex}")

    try:
        omml = converter.convert_to_omml(latex)
        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")

        # Check if cases was converted to array
        cleaned = converter._preprocess_formula_for_omml(latex)
        became_array = "array" in cleaned
        if "cases" not in cleaned and became_array:
            print("✓ cases converted to array environment")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
|
||||
|
||||
|
||||
def test_case_4_aligned_environment():
    """Convert an ``aligned`` environment and report the outcome.

    Returns:
        True when conversion and the preprocessing call finish without
        raising; False otherwise. The aligned-to-array and alignment-marker
        checks only print a confirmation line.
    """
    rule = "=" * 80
    print(f"\n{rule}\nTest 4: aligned environment\n{rule}")

    converter = Converter()

    latex = r"\begin{aligned} x + y &= 5 \\ 2x - y &= 1 \end{aligned}"
    print(f"LaTeX: {latex}")

    try:
        omml = converter.convert_to_omml(latex)
        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")

        # Check if aligned was converted
        cleaned = converter._preprocess_formula_for_omml(latex)
        if "aligned" not in cleaned and "array" in cleaned:
            print("✓ aligned converted to array environment")

        # Fewer '&' than in the input (or none at all) means markers were dropped.
        markers_left = cleaned.count("&")
        if markers_left == 0 or markers_left < latex.count("&"):
            print("✓ Alignment markers removed")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
|
||||
|
||||
|
||||
def test_case_5_simple_formula():
    """Convert a plain formula that should need no preprocessing.

    Returns:
        True when ``Converter.convert_to_omml`` finishes without raising,
        False otherwise (the error is printed).
    """
    rule = "=" * 80
    print(f"\n{rule}\nTest 5: Simple formula\n{rule}")

    converter = Converter()

    latex = r"x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}"
    print(f"LaTeX: {latex}")

    try:
        omml = converter.convert_to_omml(latex)
        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
|
||||
|
||||
|
||||
def test_case_6_nested_structures():
    """Convert a formula nesting vmatrix and cases inside a spaced array.

    Returns:
        True when conversion and the preprocessing call finish without
        raising; False otherwise. The per-feature preprocessing checks only
        print a confirmation line each.
    """
    rule = "=" * 80
    print(f"\n{rule}\nTest 6: Nested structures\n{rule}")

    converter = Converter()

    latex = r"\left\{ \begin{array}{l c} \begin{vmatrix} a & b \\ c & d \end{vmatrix} & = ad - bc \\ f(x) = \begin{cases} 1 & x > 0 \\ 0 & x \leq 0 \end{cases} & \text{step function} \end{array} \right."
    print(f"LaTeX: {latex}")

    try:
        omml = converter.convert_to_omml(latex)
        print(f"✓ SUCCESS: Converted to OMML ({len(omml)} chars)")

        cleaned = converter._preprocess_formula_for_omml(latex)
        print("\nPreprocessing applied:")
        # Each environment name should have disappeared from the cleaned text.
        for env_name in ("vmatrix", "cases"):
            if env_name not in cleaned:
                print(f"  ✓ {env_name} converted")
        spec_compacted = "{lc}" in cleaned
        if "{l c}" not in cleaned and spec_compacted:
            print("  ✓ Array specifiers fixed")
    except Exception as exc:
        print(f"✗ FAILED: {exc}")
        return False
    return True
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run every case, print a summary table, and report
    # the outcome through the process exit status so shells and CI can detect
    # a failure (previously the script always exited with status 0).
    import sys
    import traceback

    print("=" * 80)
    print("OMML CONVERSION TEST SUITE")
    print("Testing preprocessing and conversion")
    print("=" * 80)

    # (summary label, test function) in execution order
    suite = [
        ("Simple formula", test_case_5_simple_formula),
        ("Array with spaces", test_case_1_array_with_spaces),
        ("vmatrix", test_case_2_vmatrix),
        ("cases", test_case_3_cases_environment),
        ("aligned", test_case_4_aligned_environment),
        ("Nested structures", test_case_6_nested_structures),
    ]

    results = []
    exit_code = 1  # stays non-zero unless every case passes

    try:
        for name, case in suite:
            results.append((name, case()))

        # Summary
        print("\n" + "=" * 80)
        print("TEST SUMMARY")
        print("=" * 80)

        passed = sum(1 for _, result in results if result)
        total = len(results)

        for name, result in results:
            status = "✓ PASS" if result else "✗ FAIL"
            print(f"{status}: {name}")

        print("\n" + "-" * 80)
        print(f"Total: {passed}/{total} tests passed")

        if passed == total:
            print("\n✓✓✓ ALL TESTS PASSED ✓✓✓")
            exit_code = 0
        else:
            print(f"\n✗✗✗ {total - passed} TESTS FAILED ✗✗✗")

        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\nTests interrupted by user")
    except Exception as e:
        print(f"\n\nTest suite error: {e}")
        traceback.print_exc()

    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user