Compare commits
4 Commits
35928c2484
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 10dbd59161 | |||
| df2b664af4 | |||
| 6ea37c9380 | |||
| 3870c108b2 |
@@ -2,11 +2,11 @@
|
|||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
|
|
||||||
from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service
|
from app.core.dependencies import get_image_processor, get_layout_detector, get_ocr_service, get_mineru_ocr_service
|
||||||
from app.schemas.image import ImageOCRRequest, ImageOCRResponse
|
from app.schemas.image import ImageOCRRequest, ImageOCRResponse
|
||||||
from app.services.image_processor import ImageProcessor
|
from app.services.image_processor import ImageProcessor
|
||||||
from app.services.layout_detector import LayoutDetector
|
from app.services.layout_detector import LayoutDetector
|
||||||
from app.services.ocr_service import OCRService
|
from app.services.ocr_service import OCRService, MineruOCRService
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
@@ -16,7 +16,8 @@ async def process_image_ocr(
|
|||||||
request: ImageOCRRequest,
|
request: ImageOCRRequest,
|
||||||
image_processor: ImageProcessor = Depends(get_image_processor),
|
image_processor: ImageProcessor = Depends(get_image_processor),
|
||||||
layout_detector: LayoutDetector = Depends(get_layout_detector),
|
layout_detector: LayoutDetector = Depends(get_layout_detector),
|
||||||
ocr_service: OCRService = Depends(get_ocr_service),
|
mineru_service: MineruOCRService = Depends(get_mineru_ocr_service),
|
||||||
|
paddle_service: OCRService = Depends(get_ocr_service),
|
||||||
) -> ImageOCRResponse:
|
) -> ImageOCRResponse:
|
||||||
"""Process an image and extract content as LaTeX, Markdown, and MathML.
|
"""Process an image and extract content as LaTeX, Markdown, and MathML.
|
||||||
|
|
||||||
@@ -35,12 +36,15 @@ async def process_image_ocr(
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 3. Perform OCR based on layout
|
if request.model_name == "mineru":
|
||||||
ocr_result = ocr_service.recognize(image)
|
ocr_result = mineru_service.recognize(image)
|
||||||
|
elif request.model_name == "paddle":
|
||||||
|
ocr_result = paddle_service.recognize(image)
|
||||||
|
else:
|
||||||
|
raise HTTPException(status_code=400, detail="Invalid model name")
|
||||||
except RuntimeError as e:
|
except RuntimeError as e:
|
||||||
raise HTTPException(status_code=503, detail=str(e))
|
raise HTTPException(status_code=503, detail=str(e))
|
||||||
|
|
||||||
# 4. Return response
|
|
||||||
return ImageOCRResponse(
|
return ImageOCRResponse(
|
||||||
latex=ocr_result.get("latex", ""),
|
latex=ocr_result.get("latex", ""),
|
||||||
markdown=ocr_result.get("markdown", ""),
|
markdown=ocr_result.get("markdown", ""),
|
||||||
|
|||||||
@@ -23,6 +23,9 @@ class Settings(BaseSettings):
|
|||||||
|
|
||||||
# PaddleOCR-VL Settings
|
# PaddleOCR-VL Settings
|
||||||
paddleocr_vl_url: str = "http://127.0.0.1:8000/v1"
|
paddleocr_vl_url: str = "http://127.0.0.1:8000/v1"
|
||||||
|
|
||||||
|
# MinerOCR Settings
|
||||||
|
miner_ocr_api_url: str = "http://127.0.0.1:8000/file_parse"
|
||||||
|
|
||||||
# Model Paths
|
# Model Paths
|
||||||
pp_doclayout_model_dir: Optional[str] = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2"
|
pp_doclayout_model_dir: Optional[str] = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2"
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from app.services.image_processor import ImageProcessor
|
from app.services.image_processor import ImageProcessor
|
||||||
from app.services.layout_detector import LayoutDetector
|
from app.services.layout_detector import LayoutDetector
|
||||||
from app.services.ocr_service import OCRService
|
from app.services.ocr_service import OCRService, MineruOCRService
|
||||||
from app.services.converter import Converter
|
from app.services.converter import Converter
|
||||||
from app.core.config import get_settings
|
from app.core.config import get_settings
|
||||||
|
|
||||||
@@ -45,3 +45,14 @@ def get_converter() -> Converter:
|
|||||||
"""Get a DOCX converter instance."""
|
"""Get a DOCX converter instance."""
|
||||||
return Converter()
|
return Converter()
|
||||||
|
|
||||||
|
|
||||||
|
def get_mineru_ocr_service() -> MineruOCRService:
    """Build a MinerU-backed OCR service for dependency injection.

    The endpoint is read from ``settings.miner_ocr_api_url``; when the
    settings object has no such attribute the local ``file_parse`` URL is
    used instead.

    Returns:
        A ``MineruOCRService`` wired with a converter and an image processor.
    """
    endpoint = getattr(
        get_settings(),
        'miner_ocr_api_url',
        'http://127.0.0.1:8000/file_parse',
    )
    # Collaborators are created in the same order as the keyword arguments.
    collaborators = {
        "converter": get_converter(),
        "image_processor": get_image_processor(),
    }
    return MineruOCRService(api_url=endpoint, **collaborators)
|
||||||
|
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ app = FastAPI(
|
|||||||
app.include_router(api_router, prefix=settings.api_prefix)
|
app.include_router(api_router, prefix=settings.api_prefix)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
async def health_check():
|
async def health_check():
|
||||||
"""Health check endpoint."""
|
"""Health check endpoint."""
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ class ImageOCRRequest(BaseModel):
|
|||||||
|
|
||||||
image_url: str | None = Field(None, description="URL to fetch the image from")
|
image_url: str | None = Field(None, description="URL to fetch the image from")
|
||||||
image_base64: str | None = Field(None, description="Base64-encoded image data")
|
image_base64: str | None = Field(None, description="Base64-encoded image data")
|
||||||
|
model_name: str = Field("mineru", description="Name of the model to use for OCR")
|
||||||
|
|
||||||
@model_validator(mode="after")
|
@model_validator(mode="after")
|
||||||
def validate_input(self):
|
def validate_input(self):
|
||||||
|
|||||||
@@ -122,12 +122,18 @@ class Converter:
|
|||||||
# Convert matrix environments for better Word rendering
|
# Convert matrix environments for better Word rendering
|
||||||
cleaned_md = self._convert_matrix_environments(cleaned_md)
|
cleaned_md = self._convert_matrix_environments(cleaned_md)
|
||||||
|
|
||||||
|
# Fix array environment column specifiers (remove spaces)
|
||||||
|
cleaned_md = self._fix_array_column_specifiers(cleaned_md)
|
||||||
|
|
||||||
# Fix brace spacing for equation systems
|
# Fix brace spacing for equation systems
|
||||||
cleaned_md = self._fix_brace_spacing(cleaned_md)
|
cleaned_md = self._fix_brace_spacing(cleaned_md)
|
||||||
|
|
||||||
# Convert cases and aligned environments
|
# Convert cases and aligned environments
|
||||||
cleaned_md = self._convert_special_environments(cleaned_md)
|
cleaned_md = self._convert_special_environments(cleaned_md)
|
||||||
|
|
||||||
|
# Handle LaTeX \tag{} commands for equation numbering
|
||||||
|
cleaned_md = self._convert_tag_commands(cleaned_md)
|
||||||
|
|
||||||
return cleaned_md
|
return cleaned_md
|
||||||
|
|
||||||
def _convert_matrix_environments(self, md_text: str) -> str:
|
def _convert_matrix_environments(self, md_text: str) -> str:
|
||||||
@@ -153,6 +159,37 @@ class Converter:
|
|||||||
|
|
||||||
return md_text
|
return md_text
|
||||||
|
|
||||||
|
def _fix_array_column_specifiers(self, md_text: str) -> str:
    r"""Remove spaces between column specifiers of ``array`` environments.

    Pandoc's OMML converter doesn't accept spaces between column alignment
    specifiers in array environments, so ``\begin{array}{c c c}`` is
    rewritten to ``\begin{array}{ccc}``.

    The specifier is located by brace matching rather than by a
    ``[^}]+`` regex, so specifiers containing nested groups such as
    ``{c @{\quad} c}`` or ``{l p{3cm} r}`` are handled as a whole. Only
    whitespace at the top level of the specifier is removed; the content of
    nested groups is kept verbatim. An optional position argument
    (``\begin{array}[t]{...}``) is accepted.

    Args:
        md_text: Markdown text with LaTeX formulas.

    Returns:
        Markdown text with fixed array column specifiers. Environments
        whose specifier has no matching closing brace are left untouched.
    """

    def strip_top_level_whitespace(specifier: str) -> str:
        """Drop whitespace that is not inside a nested ``{...}`` group."""
        kept = []
        depth = 0
        idx = 0
        while idx < len(specifier):
            ch = specifier[idx]
            if ch == "\\":
                # Keep a control sequence's first character verbatim so an
                # escaped brace is not counted as a group delimiter.
                kept.append(specifier[idx:idx + 2])
                idx += 2
                continue
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
            if depth > 0 or not ch.isspace():
                kept.append(ch)
            idx += 1
        return "".join(kept)

    opener = re.compile(r"\\begin\{array\}(?:\[[^\]]*\])?\{")
    text_length = len(md_text)
    pieces = []
    cursor = 0

    for match in opener.finditer(md_text):
        if match.start() < cursor:
            # Opener found inside a specifier that was already consumed.
            continue

        spec_start = match.end()
        depth = 1
        idx = spec_start
        while idx < text_length:
            ch = md_text[idx]
            if ch == "\\":
                idx += 2
                continue
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    break
            idx += 1

        if depth != 0:
            # Unbalanced braces: leave this environment exactly as it is.
            continue

        pieces.append(md_text[cursor:spec_start])
        pieces.append(strip_top_level_whitespace(md_text[spec_start:idx]))
        cursor = idx  # the closing brace is copied with the following text

    pieces.append(md_text[cursor:])
    return "".join(pieces)
|
||||||
|
|
||||||
def _fix_brace_spacing(self, md_text: str) -> str:
|
def _fix_brace_spacing(self, md_text: str) -> str:
|
||||||
"""Fix spacing issues with braces in equation systems.
|
"""Fix spacing issues with braces in equation systems.
|
||||||
|
|
||||||
@@ -218,6 +255,41 @@ class Converter:
|
|||||||
|
|
||||||
return md_text
|
return md_text
|
||||||
|
|
||||||
|
def _convert_tag_commands(self, md_text: str) -> str:
    r"""Convert LaTeX ``\tag{}`` commands to a Word-compatible form.

    The ``\tag{}`` command is not supported in Word OMML format, so a tag
    that closes a display formula is replaced by ``\quad (label)``, which
    keeps the equation number inside the formula and pushes it to the
    right. The starred form ``\tag*{label}`` prints the label without
    parentheses in LaTeX, so it becomes ``\quad label``.

    Only a tag that is the last element of a ``$$...$$`` block is
    rewritten. The formula part of the pattern is not allowed to run across
    a ``$$`` delimiter, so each search stops at the end of the block it
    started in instead of scanning the rest of the document.

    Args:
        md_text: Markdown text containing LaTeX formulas with ``\tag{}``.

    Returns:
        Markdown text with trailing ``\tag{}`` commands converted to the
        spacing format.
    """
    # $$ <formula without "$$"> \tag[*] { label } $$
    # Optional whitespace is allowed between \tag and "{" and before "$$".
    tag_at_end = re.compile(
        r"\$\$((?:(?!\$\$).)*?)\\tag(\*?)\s*\{([^}]+)\}\s*\$\$",
        re.DOTALL,
    )

    def convert_tag(match: re.Match) -> str:
        """Rebuild one display formula with its tag turned into text."""
        formula_content, starred, tag_content = match.groups()
        label = tag_content if starred else f"({tag_content})"
        return f"$${formula_content} \\quad {label}$$"

    return tag_at_end.sub(convert_tag, md_text)
|
||||||
|
|
||||||
def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
|
def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes:
|
||||||
"""Export markdown to docx or pdf file.
|
"""Export markdown to docx or pdf file.
|
||||||
|
|
||||||
|
|||||||
@@ -25,6 +25,38 @@ class ImageProcessor:
|
|||||||
"""
|
"""
|
||||||
self.padding_ratio = padding_ratio or settings.image_padding_ratio
|
self.padding_ratio = padding_ratio or settings.image_padding_ratio
|
||||||
|
|
||||||
|
def _convert_to_bgr(self, pil_image: Image.Image) -> np.ndarray:
    """Convert a PIL image to a BGR numpy array, flattening transparency.

    Images that carry transparency are composited onto a white background
    before the colour conversion, so transparent regions do not end up as
    arbitrary colours in the array handed to OpenCV. Transparency is
    detected for:

    * modes with an explicit alpha band: ``RGBA``, ``LA`` and ``PA``;
    * ``P``, ``L`` and ``RGB`` images whose ``info`` dict has a
      ``transparency`` entry (e.g. a PNG tRNS chunk).

    Every other non-RGB mode is converted with ``convert("RGB")``.

    Args:
        pil_image: PIL Image object.

    Returns:
        Image as numpy array in BGR format.
    """
    # Explicit mode names are used instead of checking band names because
    # colour spaces such as LAB also have a band called "A".
    has_alpha_band = pil_image.mode in ("RGBA", "LA", "PA")
    has_keyed_transparency = (
        pil_image.mode in ("P", "L", "RGB") and "transparency" in pil_image.info
    )

    if has_alpha_band or has_keyed_transparency:
        # Normalising to RGBA first gives a single compositing path.
        rgba = pil_image.convert("RGBA")
        canvas = Image.new("RGB", rgba.size, (255, 255, 255))
        canvas.paste(rgba, mask=rgba.split()[3])  # alpha band as mask
        pil_image = canvas
    elif pil_image.mode != "RGB":
        pil_image = pil_image.convert("RGB")

    return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
|
||||||
|
|
||||||
def load_image_from_url(self, url: str) -> np.ndarray:
|
def load_image_from_url(self, url: str) -> np.ndarray:
|
||||||
"""Load image from URL.
|
"""Load image from URL.
|
||||||
|
|
||||||
@@ -40,8 +72,8 @@ class ImageProcessor:
|
|||||||
try:
|
try:
|
||||||
with urlopen(url, timeout=30) as response:
|
with urlopen(url, timeout=30) as response:
|
||||||
image_data = response.read()
|
image_data = response.read()
|
||||||
image = Image.open(io.BytesIO(image_data))
|
pil_image = Image.open(io.BytesIO(image_data))
|
||||||
return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
return self._convert_to_bgr(pil_image)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise ValueError(f"Failed to load image from URL: {e}") from e
|
raise ValueError(f"Failed to load image from URL: {e}") from e
|
||||||
|
|
||||||
@@ -63,8 +95,8 @@ class ImageProcessor:
|
|||||||
base64_str = base64_str.split(",", 1)[1]
|
base64_str = base64_str.split(",", 1)[1]
|
||||||
|
|
||||||
image_data = base64.b64decode(base64_str)
|
image_data = base64.b64decode(base64_str)
|
||||||
image = Image.open(io.BytesIO(image_data))
|
pil_image = Image.open(io.BytesIO(image_data))
|
||||||
return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
return self._convert_to_bgr(pil_image)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise ValueError(f"Failed to decode base64 image: {e}") from e
|
raise ValueError(f"Failed to decode base64 image: {e}") from e
|
||||||
|
|
||||||
|
|||||||
@@ -140,18 +140,39 @@ class LayoutDetector:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import cv2
|
import cv2
|
||||||
|
from app.core.config import get_settings
|
||||||
from app.services.image_processor import ImageProcessor
|
from app.services.image_processor import ImageProcessor
|
||||||
|
from app.services.converter import Converter
|
||||||
|
from app.services.ocr_service import OCRService
|
||||||
|
|
||||||
|
settings = get_settings()
|
||||||
|
|
||||||
|
# Initialize dependencies
|
||||||
layout_detector = LayoutDetector()
|
layout_detector = LayoutDetector()
|
||||||
image_path = "test/timeout.png"
|
image_processor = ImageProcessor(padding_ratio=settings.image_padding_ratio)
|
||||||
|
converter = Converter()
|
||||||
|
|
||||||
|
# Initialize OCR service
|
||||||
|
ocr_service = OCRService(
|
||||||
|
vl_server_url=settings.paddleocr_vl_url,
|
||||||
|
layout_detector=layout_detector,
|
||||||
|
image_processor=image_processor,
|
||||||
|
converter=converter,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Load test image
|
||||||
|
image_path = "test/complex_formula.png"
|
||||||
image = cv2.imread(image_path)
|
image = cv2.imread(image_path)
|
||||||
image_processor = ImageProcessor(padding_ratio=0.15)
|
|
||||||
image = image_processor.add_padding(image)
|
if image is None:
|
||||||
|
print(f"Failed to load image: {image_path}")
|
||||||
# Save the padded image for debugging
|
else:
|
||||||
cv2.imwrite("debug_padded_image.png", image)
|
print(f"Image loaded: {image.shape}")
|
||||||
|
|
||||||
|
# Run OCR recognition
|
||||||
layout_info = layout_detector.detect(image)
|
result = ocr_service.recognize(image)
|
||||||
print(layout_info)
|
|
||||||
|
print("\n=== OCR Result ===")
|
||||||
|
print(f"Markdown:\n{result['markdown']}")
|
||||||
|
print(f"\nLaTeX:\n{result['latex']}")
|
||||||
|
print(f"\nMathML:\n{result['mathml']}")
|
||||||
@@ -1,17 +1,103 @@
|
|||||||
"""PaddleOCR-VL client service for text and formula recognition."""
|
"""PaddleOCR-VL client service for text and formula recognition."""
|
||||||
|
|
||||||
|
import re
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import cv2
|
||||||
|
import requests
|
||||||
|
from io import BytesIO
|
||||||
from app.core.config import get_settings
|
from app.core.config import get_settings
|
||||||
from paddleocr import PaddleOCRVL
|
from paddleocr import PaddleOCRVL
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from app.services.layout_detector import LayoutDetector
|
from app.services.layout_detector import LayoutDetector
|
||||||
from app.services.image_processor import ImageProcessor
|
from app.services.image_processor import ImageProcessor
|
||||||
from app.services.converter import Converter
|
from app.services.converter import Converter
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
|
|
||||||
|
# LaTeX command names after which OCR output tends to lose the separating
# space (e.g. "\cdotdS"). A glued token is split right after the longest
# known command name it starts with.
_COMMANDS_NEED_SPACE = {
    # operators / calculus
    "cdot", "times", "div", "pm", "mp",
    "int", "iint", "iiint", "oint", "sum", "prod", "lim",
    # common functions
    "sin", "cos", "tan", "cot", "sec", "csc",
    "log", "ln", "exp",
    # misc
    "partial", "nabla",
}

# Genuine commands that merely *start* with one of the names above. They must
# never be split ("\cdots" is not "\cdot s", "\limits" is not "\lim its") and
# they also count as valid split points ("\sinhx" -> "\sinh x").
_COMMANDS_NEVER_SPLIT = frozenset({
    "cdots", "cdotp",
    "limits", "liminf", "limsup",
    "sinh", "cosh", "tanh", "coth", "sech", "csch",
    "pmod", "pmb", "pmatrix",
    "intop", "ointop", "intercal", "intertext", "interleave",
    "lnot", "lneq", "lneqq", "lnsim", "lnapprox",
    "expandafter", "divideontimes",
})

_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
_COMMAND_TOKEN_PATTERN = re.compile(r"\\[a-zA-Z]+")

# stage2: differentials inside math segments
_DIFFERENTIAL_UPPER_PATTERN = re.compile(r"(?<!\\)d([A-Z])")
_DIFFERENTIAL_LOWER_PATTERN = re.compile(r"(?<!\\)d([a-z])")


def _split_glued_command_token(token: str) -> str:
    r"""Split an OCR-glued LaTeX command token after its longest known prefix.

    Examples:
        - \cdotdS -> \cdot dS
        - \intdx -> \int dx
        - \sinhx -> \sinh x
        - \cdots, \limits, \sinh -> unchanged (real commands)

    Args:
        token: A candidate command token, normally a match of
            ``_COMMAND_TOKEN_PATTERN``.

    Returns:
        The token with a space inserted after the recognised command, or
        the token unchanged when it is not glued.
    """
    if not token.startswith("\\"):
        return token

    body = token[1:]
    if body in _COMMANDS_NEED_SPACE or body in _COMMANDS_NEVER_SPLIT:
        # Already a complete, known command.
        return token

    # Walk from the longest proper prefix down so the first hit wins.
    for cut in range(len(body) - 1, 0, -1):
        head = body[:cut]
        if head in _COMMANDS_NEED_SPACE or head in _COMMANDS_NEVER_SPLIT:
            return f"\\{head} {body[cut:]}"

    return token


def _normalize_differentials(chunk: str) -> str:
    """Normalize ``dX`` / ``dx`` differentials in command-free math text."""
    chunk = _DIFFERENTIAL_UPPER_PATTERN.sub(r"\\mathrm{d} \1", chunk)
    return _DIFFERENTIAL_LOWER_PATTERN.sub(r"d \1", chunk)


def _postprocess_math(expr: str) -> str:
    """Postprocess a *math* expression (already inside $...$ or $$...$$).

    Stage 1 splits glued command tokens (e.g. \\cdotdS). Stage 2 normalizes
    differentials, but only in the text *between* command tokens: applying
    it to the whole expression would corrupt every command that contains a
    ``d`` followed by a letter (\\cdot, \\ldots, \\widehat, ...).
    """
    # stage1: split glued command tokens
    expr = _COMMAND_TOKEN_PATTERN.sub(
        lambda m: _split_glued_command_token(m.group(0)), expr
    )

    # stage2: normalize differentials outside of command names
    pieces = []
    cursor = 0
    for command in _COMMAND_TOKEN_PATTERN.finditer(expr):
        pieces.append(_normalize_differentials(expr[cursor:command.start()]))
        pieces.append(command.group(0))
        cursor = command.end()
    pieces.append(_normalize_differentials(expr[cursor:]))
    return "".join(pieces)


def _postprocess_markdown(markdown_content: str) -> str:
    """Apply LaTeX postprocessing only within $...$ / $$...$$ segments.

    Text outside math delimiters is returned untouched. Empty or ``None``
    input is returned as is.
    """
    if not markdown_content:
        return markdown_content

    def _fix_segment(m: re.Match) -> str:
        seg = m.group(0)
        # A display segment needs both delimiters, i.e. at least four
        # characters; a bare "$$" is an empty inline match and must not be
        # expanded to "$$$$".
        if len(seg) >= 4 and seg.startswith("$$") and seg.endswith("$$"):
            return f"$${_postprocess_math(seg[2:-2])}$$"
        if seg.startswith("$") and seg.endswith("$"):
            return f"${_postprocess_math(seg[1:-1])}$"
        return seg

    return _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
|
||||||
|
|
||||||
|
|
||||||
|
class OCRServiceBase(ABC):
    """Abstract interface shared by the OCR backends in this module."""

    @abstractmethod
    def recognize(self, image: np.ndarray) -> dict:
        """Run OCR on ``image`` and return the result as a dict.

        Concrete subclasses must override this method; the base
        implementation does nothing and returns ``None``.
        """
        ...
|
||||||
|
|
||||||
|
|
||||||
|
class OCRService(OCRServiceBase):
|
||||||
"""Service for OCR using PaddleOCR-VL."""
|
"""Service for OCR using PaddleOCR-VL."""
|
||||||
|
|
||||||
_pipeline: Optional[PaddleOCRVL] = None
|
_pipeline: Optional[PaddleOCRVL] = None
|
||||||
@@ -35,6 +121,7 @@ class OCRService:
|
|||||||
self.layout_detector = layout_detector
|
self.layout_detector = layout_detector
|
||||||
self.image_processor = image_processor
|
self.image_processor = image_processor
|
||||||
self.converter = converter
|
self.converter = converter
|
||||||
|
|
||||||
def _get_pipeline(self):
|
def _get_pipeline(self):
|
||||||
"""Get or create PaddleOCR-VL pipeline.
|
"""Get or create PaddleOCR-VL pipeline.
|
||||||
|
|
||||||
@@ -49,7 +136,7 @@ class OCRService:
|
|||||||
)
|
)
|
||||||
return OCRService._pipeline
|
return OCRService._pipeline
|
||||||
|
|
||||||
def recognize_mixed(self, image: np.ndarray) -> dict:
|
def _recognize_mixed(self, image: np.ndarray) -> dict:
|
||||||
"""Recognize mixed content (text + formulas) using PP-DocLayoutV2.
|
"""Recognize mixed content (text + formulas) using PP-DocLayoutV2.
|
||||||
|
|
||||||
This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
|
This mode uses PaddleOCR-VL with PP-DocLayoutV2 for document-aware
|
||||||
@@ -71,6 +158,7 @@ class OCRService:
|
|||||||
for res in output:
|
for res in output:
|
||||||
markdown_content += res.markdown.get("markdown_texts", "")
|
markdown_content += res.markdown.get("markdown_texts", "")
|
||||||
|
|
||||||
|
markdown_content = _postprocess_markdown(markdown_content)
|
||||||
convert_result = self.converter.convert_to_formats(markdown_content)
|
convert_result = self.converter.convert_to_formats(markdown_content)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -81,7 +169,7 @@ class OCRService:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Mixed recognition failed: {e}") from e
|
raise RuntimeError(f"Mixed recognition failed: {e}") from e
|
||||||
|
|
||||||
def recognize_formula(self, image: np.ndarray) -> dict:
|
def _recognize_formula(self, image: np.ndarray) -> dict:
|
||||||
"""Recognize formula/math content using PaddleOCR-VL with prompt.
|
"""Recognize formula/math content using PaddleOCR-VL with prompt.
|
||||||
|
|
||||||
This mode uses PaddleOCR-VL directly with a formula recognition prompt.
|
This mode uses PaddleOCR-VL directly with a formula recognition prompt.
|
||||||
@@ -102,6 +190,7 @@ class OCRService:
|
|||||||
for res in output:
|
for res in output:
|
||||||
markdown_content += res.markdown.get("markdown_texts", "")
|
markdown_content += res.markdown.get("markdown_texts", "")
|
||||||
|
|
||||||
|
markdown_content = _postprocess_markdown(markdown_content)
|
||||||
convert_result = self.converter.convert_to_formats(markdown_content)
|
convert_result = self.converter.convert_to_formats(markdown_content)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -124,18 +213,116 @@ class OCRService:
|
|||||||
padded_image = self.image_processor.add_padding(image)
|
padded_image = self.image_processor.add_padding(image)
|
||||||
layout_info = self.layout_detector.detect(padded_image)
|
layout_info = self.layout_detector.detect(padded_image)
|
||||||
if layout_info.MixedRecognition:
|
if layout_info.MixedRecognition:
|
||||||
return self.recognize_mixed(image)
|
return self._recognize_mixed(image)
|
||||||
else:
|
else:
|
||||||
return self.recognize_formula(image)
|
return self._recognize_formula(image)
|
||||||
|
|
||||||
|
|
||||||
|
class MineruOCRService(OCRServiceBase):
    """Service for OCR using a local ``file_parse`` HTTP API."""

    def __init__(
        self,
        api_url: str = "http://127.0.0.1:8000/file_parse",
        image_processor: Optional[ImageProcessor] = None,
        converter: Optional[Converter] = None,
        timeout: float = 30,
    ):
        """Initialize the local API service.

        Args:
            api_url: URL of the local file_parse API endpoint.
            image_processor: Optional image processor; when given, its
                ``add_padding`` is applied to the image before upload.
            converter: Optional converter instance for format conversion.
                Without it, ``latex`` and ``mathml`` stay empty.
            timeout: Timeout in seconds passed to ``requests.post``.
        """
        self.api_url = api_url
        self.image_processor = image_processor
        self.converter = converter
        self.timeout = timeout

    @staticmethod
    def _extract_markdown(result) -> str:
        """Return ``results.image.md_content`` from the API payload.

        Any missing level, non-dict level or null value yields ``""`` so the
        caller always receives a string.
        """
        results = result.get('results') if isinstance(result, dict) else None
        # NOTE(review): the 'image' key presumably mirrors the stem of the
        # uploaded file name ('image.png') - confirm against the API.
        entry = results.get('image') if isinstance(results, dict) else None
        if not isinstance(entry, dict):
            return ""
        return entry.get('md_content') or ""

    def recognize(self, image: np.ndarray) -> dict:
        """Recognize content using the local file_parse API.

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Dict with 'markdown', 'latex', 'mathml' keys.

        Raises:
            RuntimeError: If the HTTP request fails or any other step of the
                recognition raises.
        """
        try:
            if self.image_processor:
                image = self.image_processor.add_padding(image)

            # Convert numpy array to image bytes
            success, encoded_image = cv2.imencode('.png', image)
            if not success:
                raise RuntimeError("Failed to encode image")

            image_bytes = BytesIO(encoded_image.tobytes())

            # Prepare multipart form data
            files = {
                'files': ('image.png', image_bytes, 'image/png')
            }

            data = {
                'return_middle_json': 'false',
                'return_model_output': 'false',
                'return_md': 'true',
                'return_images': 'false',
                'end_page_id': '99999',
                'start_page_id': '0',
                'lang_list': 'en',
                # NOTE(review): 'string' looks like an API-docs placeholder
                # rather than a real server URL - confirm it is intended.
                'server_url': 'string',
                'return_content_list': 'false',
                'backend': 'hybrid-auto-engine',
                'table_enable': 'true',
                'response_format_zip': 'false',
                'formula_enable': 'true',
                'parse_method': 'ocr'
            }

            # Make API request
            response = requests.post(
                self.api_url,
                files=files,
                data=data,
                headers={'accept': 'application/json'},
                timeout=self.timeout,
            )
            response.raise_for_status()

            # Extract markdown content from response
            markdown_content = self._extract_markdown(response.json())

            # markdown_content = _postprocess_markdown(markdown_content)

            # Convert to other formats if converter is available
            latex = ""
            mathml = ""
            if self.converter and markdown_content:
                convert_result = self.converter.convert_to_formats(markdown_content)
                latex = convert_result.latex
                mathml = convert_result.mathml

            return {
                "markdown": markdown_content,
                "latex": latex,
                "mathml": mathml,
            }

        except requests.RequestException as e:
            raise RuntimeError(f"Local API request failed: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Recognition failed: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import cv2
|
mineru_service = MineruOCRService()
|
||||||
from app.services.image_processor import ImageProcessor
|
image = cv2.imread("test/complex_formula.png")
|
||||||
from app.services.layout_detector import LayoutDetector
|
image_numpy = np.array(image)
|
||||||
image_processor = ImageProcessor(padding_ratio=0.15)
|
ocr_result = mineru_service.recognize(image_numpy)
|
||||||
layout_detector = LayoutDetector()
|
|
||||||
ocr_service = OCRService(image_processor=image_processor, layout_detector=layout_detector)
|
|
||||||
image = cv2.imread("test/image.png")
|
|
||||||
ocr_result = ocr_service.recognize(image)
|
|
||||||
print(ocr_result)
|
print(ocr_result)
|
||||||
Reference in New Issue
Block a user