From 35928c2484f3906bb0f97e375c3ea367743f6fc2 Mon Sep 17 00:00:00 2001
From: yogeliu
Date: Wed, 31 Dec 2025 17:38:32 +0800
Subject: [PATCH] fix: refactor logic

---
 .dockerignore                   |  55 ++++++
 .gitignore                      |   2 +
 Dockerfile                      |  81 +++++---
 app/api/v1/endpoints/convert.py |  28 ++-
 app/api/v1/endpoints/image.py   |  21 +-
 app/core/config.py              |  11 +-
 app/core/dependencies.py        |  19 +-
 app/main.py                     |   8 +-
 app/pkg/reference.docx          | Bin 0 -> 14107 bytes
 app/schemas/convert.py          |   2 +-
 app/schemas/image.py            |   5 +-
 app/services/converter.py       | 312 +++++++++++++++++++++++++++++
 app/services/docx_converter.py  | 335 --------------------------------
 app/services/image_processor.py |   2 +-
 app/services/layout_detector.py | 217 ++++++++++++---------
 app/services/ocr_service.py     | 280 ++++++--------------------
 pyproject.toml                  |  38 ++--
 17 files changed, 678 insertions(+), 738 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 app/pkg/reference.docx
 create mode 100644 app/services/converter.py
 delete mode 100644 app/services/docx_converter.py

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..c01ecbc
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,55 @@
+# Git
+.git
+.gitignore
+
+# Python
+.venv/
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+*.egg-info/
+.eggs/
+dist/
+build/
+
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+test/
+tests/
+
+# Linting & IDE
+.ruff_cache/
+.mypy_cache/
+.cursor/
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# Environment
+.env
+.env.*
+!.env.example
+
+# Documentation (not needed in container)
+*.md
+!README.md
+openspec/
+
+# Models (mounted at runtime, not built into image)
+app/model/doclayout/*.pdiparams
+app/model/DocLayout/
+app/model/PP-DocLayout/
+
+# Misc
+*.log
+*.tmp
+.DS_Store
+Thumbs.db
+
+test/
+
diff --git a/.gitignore b/.gitignore
index e49f677..d9d72c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -71,3 +71,5 @@ htmlcov/

 uv.lock
 model/
+
+test/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 3f3b60c..1586f2b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,54 +1,73 @@
 # DocProcesser Dockerfile
 # Optimized for RTX 5080 GPU deployment

-# Use NVIDIA CUDA base image with Python 3.11
+# Use NVIDIA CUDA base image with Python 3.10
 FROM nvidia/cuda:12.8.0-runtime-ubuntu24.04

 # Set environment variables
 ENV PYTHONUNBUFFERED=1 \
     PYTHONDONTWRITEBYTECODE=1 \
     PIP_NO_CACHE_DIR=1 \
-    PIP_DISABLE_PIP_VERSION_CHECK=1
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    # Model cache directories - mount these at runtime
+    MODELSCOPE_CACHE=/root/.cache/modelscope \
+    HF_HOME=/root/.cache/huggingface \
+    # Application config (override defaults for container)
+    # Use 127.0.0.1 for --network host mode, or override with -e for bridge mode
+    PP_DOCLAYOUT_MODEL_DIR=/root/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2 \
+    PADDLEOCR_VL_URL=http://127.0.0.1:8000/v1

 # Set working directory
 WORKDIR /app

-# Install system dependencies
+# Install system dependencies and Python 3.10 from deadsnakes PPA
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    python3.11 \
-    python3.11-venv \
-    python3.11-dev \
-    python3-pip \
-    libgl1-mesa-glx \
+    software-properties-common \
+    && add-apt-repository -y ppa:deadsnakes/ppa \
+    && apt-get update && apt-get install -y --no-install-recommends \
+    python3.10 \
+    python3.10-venv \
+    python3.10-dev \
+    python3.10-distutils \
+    libgl1 \
     libglib2.0-0 \
     libsm6 \
     libxext6 \
     libxrender-dev \
     libgomp1 \
     curl \
+    pandoc \
     && rm -rf /var/lib/apt/lists/* \
-    && ln -sf /usr/bin/python3.11 /usr/bin/python \
-    && ln -sf
/usr/bin/python3.11 /usr/bin/python3 + && ln -sf /usr/bin/python3.10 /usr/bin/python \ + && ln -sf /usr/bin/python3.10 /usr/bin/python3 \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 -# Install uv for fast package management -RUN curl -LsSf https://astral.sh/uv/install.sh | sh -ENV PATH="/root/.local/bin:$PATH" - -# Copy dependency files first for better caching -COPY pyproject.toml ./ - -# Create virtual environment and install dependencies -RUN uv venv /app/.venv +# Install uv via pip (more reliable than install script) +RUN python3.10 -m pip install uv -i https://pypi.tuna.tsinghua.edu.cn/simple ENV PATH="/app/.venv/bin:$PATH" ENV VIRTUAL_ENV="/app/.venv" -RUN uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . +# Copy dependency files first for better caching +COPY pyproject.toml ./ +COPY wheels/ ./wheels/ + +# Create virtual environment and install dependencies +RUN uv venv /app/.venv --python python3.10 \ + && uv pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -e . \ + && rm -rf ./wheels # Copy application code COPY app/ ./app/ -# Create model directories (models should be mounted at runtime) -RUN mkdir -p /app/app/model/DocLayout /app/app/model/PP-DocLayout +# Create model cache directories (mount from host at runtime) +RUN mkdir -p /root/.cache/modelscope \ + /root/.cache/huggingface \ + /root/.paddlex \ + /app/app/model/DocLayout \ + /app/app/model/PP-DocLayout + +# Declare volumes for model cache (mount at runtime to avoid re-downloading) +VOLUME ["/root/.cache/modelscope", "/root/.cache/huggingface", "/root/.paddlex"] # Expose port EXPOSE 8053 @@ -60,3 +79,21 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ # Run the application CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8053", "--workers", "1"] +# ============================================================================= +# Usage: Mount local model cache to avoid downloading +# +# Option 1: Use host network (simplest, can access localhost services) +# docker run --gpus all --network host \ +# -v /home/yoge/.paddlex:/root/.paddlex:ro \ +# -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \ +# -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \ +# doc_processer:latest +# +# Option 2: Use bridge network with host.docker.internal (Linux needs --add-host) +# docker run --gpus all -p 8053:8053 \ +# --add-host=host.docker.internal:host-gateway \ +# -v /home/yoge/.paddlex:/root/.paddlex:ro \ +# -v /home/yoge/.cache/modelscope:/root/.cache/modelscope:ro \ +# -v /home/yoge/.cache/huggingface:/root/.cache/huggingface:ro \ +# doc_processer:latest +# ============================================================================= diff --git a/app/api/v1/endpoints/convert.py b/app/api/v1/endpoints/convert.py index 256c085..ea381fd 100644 --- a/app/api/v1/endpoints/convert.py +++ b/app/api/v1/endpoints/convert.py @@ -3,34 +3,28 @@ from fastapi import APIRouter, Depends, HTTPException from fastapi.responses import Response -from app.core.dependencies import get_docx_converter +from app.core.dependencies import get_converter from app.schemas.convert import MarkdownToDocxRequest -from app.services.docx_converter import DocxConverter +from app.services.converter import Converter router = APIRouter() -@router.post("/docx") +@router.post("/file") async def convert_markdown_to_docx( request: MarkdownToDocxRequest, - converter: DocxConverter = Depends(get_docx_converter), + converter: Converter = Depends(get_converter), ) -> Response: """Convert 
markdown content to DOCX file. - Returns the generated DOCX file as a binary download. + Returns the generated DOCX file as a binary response. """ try: - docx_bytes = converter.convert(request.markdown) + docx_bytes = converter.export_to_file(request.markdown, export_type="docx") + return Response( + content=docx_bytes, + media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + headers={"Content-Disposition": f'attachment; filename="{request.filename}.docx"'}, + ) except Exception as e: raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") - - # Determine filename - filename = request.filename or "output" - if not filename.endswith(".docx"): - filename = f"{filename}.docx" - - return Response( - content=docx_bytes, - media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", - headers={"Content-Disposition": f'attachment; filename="{filename}"'}, - ) diff --git a/app/api/v1/endpoints/image.py b/app/api/v1/endpoints/image.py index c194213..635ebf7 100644 --- a/app/api/v1/endpoints/image.py +++ b/app/api/v1/endpoints/image.py @@ -28,24 +28,15 @@ async def process_image_ocr( - Otherwise: use PaddleOCR-VL with formula prompt 4. Convert output to LaTeX, Markdown, and MathML formats """ - try: - # 1. Load and preprocess image - image = image_processor.preprocess( - image_url=request.image_url, - image_base64=request.image_base64, - ) - except ValueError as e: - raise HTTPException(status_code=400, detail=str(e)) - try: - # 2. Detect layout - layout_info = layout_detector.detect(image) - except RuntimeError as e: - raise HTTPException(status_code=500, detail=f"Layout detection failed: {e}") + image = image_processor.preprocess( + image_url=request.image_url, + image_base64=request.image_base64, + ) try: # 3. 
Perform OCR based on layout - ocr_result = ocr_service.recognize(image, layout_info) + ocr_result = ocr_service.recognize(image) except RuntimeError as e: raise HTTPException(status_code=503, detail=str(e)) @@ -54,6 +45,4 @@ async def process_image_ocr( latex=ocr_result.get("latex", ""), markdown=ocr_result.get("markdown", ""), mathml=ocr_result.get("mathml", ""), - layout_info=layout_info, - recognition_mode=ocr_result.get("recognition_mode", ""), ) diff --git a/app/core/config.py b/app/core/config.py index af18a14..c3d81a7 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -5,6 +5,7 @@ from pathlib import Path from pydantic_settings import BaseSettings, SettingsConfigDict import torch +from typing import Optional class Settings(BaseSettings): @@ -21,11 +22,10 @@ class Settings(BaseSettings): debug: bool = False # PaddleOCR-VL Settings - paddleocr_vl_url: str = "http://localhost:8080/v1" + paddleocr_vl_url: str = "http://127.0.0.1:8000/v1" # Model Paths - doclayout_model_path: str = "app/model/DocLayout/best.pt" - pp_doclayout_model_dir: str = "app/model/PP-DocLayout/PP-DocLayoutV2" + pp_doclayout_model_dir: Optional[str] = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV2" # Image Processing max_image_size_mb: int = 10 @@ -37,11 +37,6 @@ class Settings(BaseSettings): host: str = "0.0.0.0" port: int = 8053 - @property - def doclayout_model_file(self) -> Path: - """Get the DocLayout model file path.""" - return Path(self.doclayout_model_path) - @property def pp_doclayout_dir(self) -> Path: """Get the PP-DocLayout model directory path.""" diff --git a/app/core/dependencies.py b/app/core/dependencies.py index dcd04ae..ea19022 100644 --- a/app/core/dependencies.py +++ b/app/core/dependencies.py @@ -3,20 +3,20 @@ from app.services.image_processor import ImageProcessor from app.services.layout_detector import LayoutDetector from app.services.ocr_service import OCRService -from app.services.docx_converter import DocxConverter +from app.services.converter import Converter +from app.core.config import get_settings # Global instances (initialized on startup) _layout_detector: LayoutDetector | None = None -def init_layout_detector(model_path: str) -> None: +def init_layout_detector() -> None: """Initialize the global layout detector. Called during application startup. 
""" global _layout_detector - _layout_detector = LayoutDetector(model_path=model_path) - _layout_detector.load_model() + _layout_detector = LayoutDetector() def get_layout_detector() -> LayoutDetector: @@ -33,10 +33,15 @@ def get_image_processor() -> ImageProcessor: def get_ocr_service() -> OCRService: """Get an OCR service instance.""" - return OCRService() + return OCRService( + vl_server_url=get_settings().paddleocr_vl_url, + layout_detector=get_layout_detector(), + image_processor=get_image_processor(), + converter=get_converter(), + ) -def get_docx_converter() -> DocxConverter: +def get_converter() -> Converter: """Get a DOCX converter instance.""" - return DocxConverter() + return Converter() diff --git a/app/main.py b/app/main.py index 174b5ae..88d9fe2 100644 --- a/app/main.py +++ b/app/main.py @@ -15,7 +15,7 @@ settings = get_settings() async def lifespan(app: FastAPI): """Application lifespan handler for startup/shutdown.""" # Startup: Load models - init_layout_detector(model_path=settings.doclayout_model_path) + init_layout_detector() yield @@ -37,3 +37,9 @@ app.include_router(api_router, prefix=settings.api_prefix) async def health_check(): """Health check endpoint.""" return {"status": "healthy"} + + + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8053) \ No newline at end of file diff --git a/app/pkg/reference.docx b/app/pkg/reference.docx new file mode 100644 index 0000000000000000000000000000000000000000..9f8bdb470eb4bfed772e4a7214db4991506623d2 GIT binary patch literal 14107 zcmb7rb9`pY(rz@dC$=?lGO=yjwrzW2TN4`-+qSKVZ6`Og_c>?pJ>U86@813+@5);B zbahwtTkqmZ}y-DMI2*Q-*uZWLvUPWWylAX#T-OPGd&fn zw#xdZjSIl$)j;elZ(iV9<`UeFNFQ$BnS>vmy1!zMHAX4i0BgiWVyT#|L5`<$v#*si zcdbmA-kG8l+9npd(zQM-KBXJqM;XSiamG&((Hxnips2@YVNY4PwvMmKI>iWJ0H)mTb8%7aGD{sreD9w&r;{CB-K}7>PK?+Rn6NkOR3@hRnfD! z*}4zVFm&_PHXc(4Hednhz-ATSHN4XLig-9Tdejr~af%}fJc0lthz5e|NB=9fjgD!Y zrq!BIS_K1@GIdMp-=Uo}pgU7qb#komMY5!A)+f>=iH&H1)wr}K3EYs`& zZrr4r5UT4DZL75BpS<3_ETR6>4TXl)gkL_~fc)Wx&;M|PqrHQ*{_- z`@QoIwn?T^tFtdhzJXk3XP;3JOE&Pu#S>+~UV1a@OI>aX(!k-Ij|5M+(=4@yAz(E! z`Nxt&dq4vwNjUni=3g9EUlM?aTN*Azid`Y$H2J2TJW(_)m~Ah^t^x!{lLWe(7k=dn zI&CtEIPag9bjues*DQ4wxqa<)$zP;w{1KwlDqz8+1Y!Nu-Nj(-S>-M06qa!^dUwgf zx<>gJfXhpBGMRr-o@=NmE5Ru@kdL~VeX&ie<;v2XjFe}w-Rc#r!xjhuQiE)DyAsSE zd}?I$%@wVMY5~@>>e$wePRSTVe*04|oChK(KIa06Cm5o`C=a8ovrN*ev3`+g-=O~ zA3^Cc3jhG_-y-%7t`>&&zf8ZTDQ>^Qg4k|7_Xd_I)dDtevv~=&OTnbAq1Q);ujD?qHA1lXCn~0HsbA3>#8r_Tf{W8rz zW@kXF27EjC+tk8yT3c;Q+^$utO%Kr>jnIS_)6?$j#{1pbA;(A-KexRPPkM&%dY?(c z{_*we>!|u;Oj?`hyce`KbsVFe{X;U>iLlj^W$pTPtjXzK7YDfii|hNv=B5GBAQ9?! 
diff --git a/app/services/converter.py b/app/services/converter.py
new file mode 100644
--- /dev/null
+++ b/app/services/converter.py
@@ -0,0 +1,312 @@
+    def convert_to_formats(self, md_text: str) -> ConvertResult:
+        """Convert markdown to LaTeX and MathML formats.
+
+        Args:
+            md_text: Markdown text to convert.
+
+        Returns:
+            ConvertResult with latex and mathml fields.
+
+        Raises:
+            RuntimeError: If conversion fails.
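+
+        Example (illustrative; the exact output depends on the pandoc
+        version and the configured INPUT_FORMAT):
+            result = Converter().convert_to_formats("Euler: $e^{i\\pi} + 1 = 0$")
+            result.latex    # LaTeX body emitted by pandoc
+            result.mathml   # HTML body with the formula rendered as MathML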
+        """
+        if md_text == "":
+            return ConvertResult(latex="", mathml="")
+
+        try:
+            # Convert to LaTeX
+            latex_output = pypandoc.convert_text(
+                md_text,
+                "latex",
+                format=self.INPUT_FORMAT,
+            ).rstrip("\n")
+
+            # Convert to HTML with MathML
+            mathml_output = pypandoc.convert_text(
+                md_text,
+                "html",
+                format=self.INPUT_FORMAT,
+                extra_args=["--mathml"],
+            ).rstrip("\n")
+
+            return ConvertResult(latex=latex_output, mathml=mathml_output)
+
+        except Exception as e:
+            raise RuntimeError(f"Conversion failed: {e}") from e
+
+    def preprocess_for_export(self, md_text: str) -> str:
+        """Preprocess markdown text for export to docx/pdf.
+
+        Handles LaTeX formula formatting, matrix environments, and
+        other transformations needed for proper Word/PDF rendering.
+
+        Args:
+            md_text: Raw markdown text.
+
+        Returns:
+            Preprocessed markdown text.
+        """
+        # Replace \[1mm] => \vspace{1mm}
+        md_text = re.sub(r"\\\[1mm\]", r"\\vspace{1mm}", md_text)
+
+        # Add blank lines around \[...\] block formulas
+        md_text = re.sub(
+            r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])",
+            r"\1\n\n\\[\3\\]\n\n\4",
+            md_text,
+            flags=re.DOTALL,
+        )
+        md_text = re.sub(
+            r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)",
+            r"\n\\[\2\\]\n",
+            md_text,
+            flags=re.MULTILINE | re.DOTALL,
+        )
+
+        # Remove arithmatex span wrappers
+        cleaned_md = re.sub(r'<span class="arithmatex">(.*?)</span>', r"\1", md_text)
+
+        # Convert inline formulas: \( \) => $ $
+        cleaned_md = re.sub(r"\\\(", r"$", cleaned_md)
+        cleaned_md = re.sub(r"\\\)", r"$", cleaned_md)
+
+        # Convert block formulas: \[ \] => $$ $$
+        cleaned_md = re.sub(r"\\\[", r"$$", cleaned_md)
+        cleaned_md = re.sub(r"\\\]", r"$$", cleaned_md)
+
+        # Remove spaces between $ and formula content
+        # Use negative lookahead/lookbehind to avoid matching $$ block formulas
+        cleaned_md = re.sub(r"(?<!\$)\$\s+", r"$", cleaned_md)
+        cleaned_md = re.sub(r"\s+\$(?!\$)", r"$", cleaned_md)
+
+        # Fix matrix, brace, and special environments for Word rendering
+        cleaned_md = self._fix_matrix_environments(cleaned_md)
+        cleaned_md = self._fix_brace_spacing(cleaned_md)
+        cleaned_md = self._convert_special_environments(cleaned_md)
+
+        return cleaned_md
+
+    def _fix_matrix_environments(self, md_text: str) -> str:
+        """Convert vmatrix/Vmatrix to left/right delimited forms.
+
+        This fixes the vertical line height issues in Word.
+        """
+        # vmatrix -> \left| \begin{matrix}...\end{matrix} \right|
+        md_text = re.sub(
+            r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}",
+            r"\\left| \\begin{matrix}\1\\end{matrix} \\right|",
+            md_text,
+            flags=re.DOTALL,
+        )
+
+        # Vmatrix -> \left\| \begin{matrix}...\end{matrix} \right\|
+        md_text = re.sub(
+            r"\\begin\{Vmatrix\}(.*?)\\end\{Vmatrix\}",
+            r"\\left\\| \\begin{matrix}\1\\end{matrix} \\right\\|",
+            md_text,
+            flags=re.DOTALL,
+        )
+
+        return md_text
+
+    def _fix_brace_spacing(self, md_text: str) -> str:
+        """Fix spacing issues with braces in equation systems.
+
+        Removes whitespace and adds negative space for proper alignment in Word/OMML.
+        """
+        # Fix \left\{ spacing
+        md_text = re.sub(
+            r"\\left\\\{\s+",
+            r"\\left\\{\\!",
+            md_text,
+        )
+
+        # Fix \right\} spacing
+        md_text = re.sub(
+            r"\s+\\right\\\}",
+            r"\\!\\right\\}",
+            md_text,
+        )
+
+        return md_text
+
+    def _convert_special_environments(self, md_text: str) -> str:
+        """Convert cases and aligned environments to array format.
+
+        These environments have better rendering support in Word/OMML.
+        """
+
+        def convert_cases(match: re.Match) -> str:
+            content = match.group(1)
+            return r"\left\{\begin{array}{ll}" + content + r"\end{array}\right."
+
+        md_text = re.sub(
+            r"\\begin\{cases\}(.*?)\\end\{cases\}",
+            convert_cases,
+            md_text,
+            flags=re.DOTALL,
+        )
+
+        def convert_aligned_to_array(match: re.Match) -> str:
+            content = match.group(1)
+            # Remove leading & alignment markers (not needed in array{l})
+            content = re.sub(r"(^|\\\\)\s*&", r"\1", content)
+            return r"\left\{\begin{array}{l}" + content + r"\end{array}\right."
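+        # Illustrative (assumed input shape): the substitution below turns
+        #   \left\{\begin{aligned} & x = 1 \\ & y = 2 \end{aligned}\right.
+        # into
+        #   \left\{\begin{array}{l} x = 1 \\ y = 2 \end{array}\right.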
+ + md_text = re.sub( + r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", + convert_aligned_to_array, + md_text, + flags=re.DOTALL, + ) + + def convert_standalone_aligned(match: re.Match) -> str: + content = match.group(1) + content = re.sub(r"(^|\\\\)\s*&", r"\1", content) + return r"\begin{array}{l}" + content + r"\end{array}" + + md_text = re.sub( + r"\\begin\{aligned\}(.*?)\\end\{aligned\}", + convert_standalone_aligned, + md_text, + flags=re.DOTALL, + ) + + return md_text + + def export_to_file(self, md_text: str, export_type: ExportType = "docx") -> bytes: + """Export markdown to docx or pdf file. + + Args: + md_text: Markdown text to export. + export_type: Export format, either 'docx' or 'pdf'. + + Returns: + bytes of the exported file. + + Raises: + ValueError: If export_type is not supported. + RuntimeError: If export fails. + + """ + + # Preprocess markdown + cleaned_md = self.preprocess_for_export(md_text) + + # Create temp file for input + with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as f_in: + f_in.write(cleaned_md.encode("utf-8")) + md_path = f_in.name + + output_file = md_path + "." + export_type + + try: + if export_type == "docx": + self._export_docx(md_path, output_file) + with open(output_file, "rb") as f: + return f.read() + else: # pdf + self._export_pdf(md_path, output_file) + with open(output_file, "rb") as f: + return f.read() + + except Exception as e: + # Cleanup on error + self._cleanup_files(md_path, output_file) + raise RuntimeError(f"Export failed: {e}") from e + finally: + # Always cleanup input file + if os.path.exists(md_path): + os.remove(md_path) + + def _export_docx(self, input_path: str, output_path: str) -> None: + """Export to DOCX format using pypandoc.""" + extra_args = [ + "--highlight-style=pygments", + f"--reference-doc=app/pkg/reference.docx", + ] + pypandoc.convert_file( + input_path, + "docx", + format=self.INPUT_FORMAT, + outputfile=output_path, + extra_args=extra_args, + ) + + def _export_pdf(self, input_path: str, output_path: str) -> None: + """Export to PDF format using pypandoc with XeLaTeX.""" + extra_args = [ + "--pdf-engine=xelatex", + "-V", + "mainfont=Noto Sans CJK SC", + "--highlight-style=pygments", + ] + pypandoc.convert_file( + input_path, + "pdf", + format=self.INPUT_FORMAT, + outputfile=output_path, + extra_args=extra_args, + ) + + def _cleanup_files(self, *paths: str) -> None: + """Remove files if they exist.""" + for path in paths: + if os.path.exists(path): + os.remove(path) + + def cleanup_export_file(self, file_path: str) -> None: + """Cleanup exported file after sending response. + + Call this after sending the file to the client. + + Args: + file_path: Path to the exported file. + """ + if os.path.exists(file_path): + os.remove(file_path) + diff --git a/app/services/docx_converter.py b/app/services/docx_converter.py deleted file mode 100644 index 6364507..0000000 --- a/app/services/docx_converter.py +++ /dev/null @@ -1,335 +0,0 @@ -"""Markdown to DOCX conversion service. 
- -Reference implementation based on https://github.com/YogeLiu/markdown_2_docx -""" - -import io -import re -from dataclasses import dataclass - -from docx import Document -from docx.enum.text import WD_ALIGN_PARAGRAPH -from docx.oxml import OxmlElement -from docx.oxml.ns import qn -from docx.shared import Inches, Pt - - -@dataclass -class MarkdownElement: - """Parsed markdown element.""" - - type: str # heading, paragraph, list_item, code_block, table, math - content: str - level: int = 0 # For headings and lists - language: str = "" # For code blocks - - -class DocxConverter: - """Converts markdown content to DOCX format.""" - - def __init__(self): - """Initialize the converter.""" - self.heading_pattern = re.compile(r"^(#{1,6})\s+(.+)$") - self.list_pattern = re.compile(r"^(\s*)[-*+]\s+(.+)$") - self.ordered_list_pattern = re.compile(r"^(\s*)\d+\.\s+(.+)$") - self.code_block_pattern = re.compile(r"^```(\w*)$") - self.inline_code_pattern = re.compile(r"`([^`]+)`") - self.bold_pattern = re.compile(r"\*\*([^*]+)\*\*") - self.italic_pattern = re.compile(r"\*([^*]+)\*") - self.math_block_pattern = re.compile(r"\$\$(.+?)\$\$", re.DOTALL) - self.inline_math_pattern = re.compile(r"\$([^$]+)\$") - - def convert(self, markdown: str) -> bytes: - """Convert markdown content to DOCX. - - Args: - markdown: Markdown content to convert. - - Returns: - DOCX file as bytes. - """ - doc = Document() - elements = self._parse_markdown(markdown) - - for element in elements: - self._add_element_to_doc(doc, element) - - # Save to bytes - buffer = io.BytesIO() - doc.save(buffer) - buffer.seek(0) - return buffer.getvalue() - - def _parse_markdown(self, markdown: str) -> list[MarkdownElement]: - """Parse markdown into elements. - - Args: - markdown: Markdown content. - - Returns: - List of parsed elements. 
- """ - elements: list[MarkdownElement] = [] - lines = markdown.split("\n") - i = 0 - in_code_block = False - code_content = [] - code_language = "" - - while i < len(lines): - line = lines[i] - - # Code block handling - code_match = self.code_block_pattern.match(line) - if code_match: - if in_code_block: - elements.append( - MarkdownElement( - type="code_block", - content="\n".join(code_content), - language=code_language, - ) - ) - code_content = [] - in_code_block = False - else: - in_code_block = True - code_language = code_match.group(1) - i += 1 - continue - - if in_code_block: - code_content.append(line) - i += 1 - continue - - # Math block ($$...$$) - if line.strip().startswith("$$"): - math_content = [] - if line.strip() == "$$": - i += 1 - while i < len(lines) and lines[i].strip() != "$$": - math_content.append(lines[i]) - i += 1 - else: - # Single line $$...$$ or start - content = line.strip()[2:] - if content.endswith("$$"): - math_content.append(content[:-2]) - else: - math_content.append(content) - i += 1 - while i < len(lines): - if lines[i].strip().endswith("$$"): - math_content.append(lines[i].strip()[:-2]) - break - math_content.append(lines[i]) - i += 1 - - elements.append( - MarkdownElement(type="math", content="\n".join(math_content)) - ) - i += 1 - continue - - # Heading - heading_match = self.heading_pattern.match(line) - if heading_match: - level = len(heading_match.group(1)) - content = heading_match.group(2) - elements.append( - MarkdownElement(type="heading", content=content, level=level) - ) - i += 1 - continue - - # Unordered list - list_match = self.list_pattern.match(line) - if list_match: - indent = len(list_match.group(1)) - content = list_match.group(2) - elements.append( - MarkdownElement(type="list_item", content=content, level=indent // 2) - ) - i += 1 - continue - - # Ordered list - ordered_match = self.ordered_list_pattern.match(line) - if ordered_match: - indent = len(ordered_match.group(1)) - content = ordered_match.group(2) - elements.append( - MarkdownElement( - type="ordered_list_item", content=content, level=indent // 2 - ) - ) - i += 1 - continue - - # Table (simple detection) - if "|" in line and i + 1 < len(lines) and "---" in lines[i + 1]: - table_lines = [line] - i += 1 - while i < len(lines) and "|" in lines[i]: - table_lines.append(lines[i]) - i += 1 - elements.append( - MarkdownElement(type="table", content="\n".join(table_lines)) - ) - continue - - # Regular paragraph - if line.strip(): - elements.append(MarkdownElement(type="paragraph", content=line)) - - i += 1 - - return elements - - def _add_element_to_doc(self, doc: Document, element: MarkdownElement) -> None: - """Add a markdown element to the document. - - Args: - doc: Word document. - element: Parsed markdown element. 
- """ - if element.type == "heading": - self._add_heading(doc, element.content, element.level) - elif element.type == "paragraph": - self._add_paragraph(doc, element.content) - elif element.type == "list_item": - self._add_list_item(doc, element.content, element.level, ordered=False) - elif element.type == "ordered_list_item": - self._add_list_item(doc, element.content, element.level, ordered=True) - elif element.type == "code_block": - self._add_code_block(doc, element.content) - elif element.type == "table": - self._add_table(doc, element.content) - elif element.type == "math": - self._add_math(doc, element.content) - - def _add_heading(self, doc: Document, content: str, level: int) -> None: - """Add a heading to the document.""" - # Map markdown levels to Word heading styles - heading_level = min(level, 9) # Word supports up to Heading 9 - doc.add_heading(content, level=heading_level) - - def _add_paragraph(self, doc: Document, content: str) -> None: - """Add a paragraph with inline formatting.""" - para = doc.add_paragraph() - self._add_formatted_text(para, content) - - def _add_formatted_text(self, para, content: str) -> None: - """Add text with inline formatting (bold, italic, code).""" - # Simple approach: process inline patterns - remaining = content - - while remaining: - # Find next formatting marker - bold_match = self.bold_pattern.search(remaining) - italic_match = self.italic_pattern.search(remaining) - code_match = self.inline_code_pattern.search(remaining) - math_match = self.inline_math_pattern.search(remaining) - - matches = [ - (bold_match, "bold"), - (italic_match, "italic"), - (code_match, "code"), - (math_match, "math"), - ] - matches = [(m, t) for m, t in matches if m] - - if not matches: - para.add_run(remaining) - break - - # Find earliest match - earliest = min(matches, key=lambda x: x[0].start()) - match, match_type = earliest - - # Add text before match - if match.start() > 0: - para.add_run(remaining[: match.start()]) - - # Add formatted text - run = para.add_run(match.group(1)) - if match_type == "bold": - run.bold = True - elif match_type == "italic": - run.italic = True - elif match_type == "code": - run.font.name = "Courier New" - run.font.size = Pt(10) - elif match_type == "math": - run.italic = True - - remaining = remaining[match.end() :] - - def _add_list_item( - self, doc: Document, content: str, level: int, ordered: bool - ) -> None: - """Add a list item.""" - para = doc.add_paragraph(style="List Bullet" if not ordered else "List Number") - para.paragraph_format.left_indent = Inches(0.25 * level) - self._add_formatted_text(para, content) - - def _add_code_block(self, doc: Document, content: str) -> None: - """Add a code block.""" - para = doc.add_paragraph() - para.paragraph_format.left_indent = Inches(0.5) - - run = para.add_run(content) - run.font.name = "Courier New" - run.font.size = Pt(9) - - # Add shading - shading = OxmlElement("w:shd") - shading.set(qn("w:val"), "clear") - shading.set(qn("w:fill"), "F0F0F0") - para._p.get_or_add_pPr().append(shading) - - def _add_table(self, doc: Document, content: str) -> None: - """Add a table from markdown table format.""" - lines = [l.strip() for l in content.split("\n") if l.strip()] - if len(lines) < 2: - return - - # Parse header - header = [c.strip() for c in lines[0].split("|") if c.strip()] - - # Skip separator line - data_lines = lines[2:] if len(lines) > 2 else [] - - # Create table - table = doc.add_table(rows=1, cols=len(header)) - table.style = "Table Grid" - - # Add header - header_cells = 
table.rows[0].cells - for i, text in enumerate(header): - header_cells[i].text = text - header_cells[i].paragraphs[0].runs[0].bold = True - - # Add data rows - for line in data_lines: - cells = [c.strip() for c in line.split("|") if c.strip()] - row_cells = table.add_row().cells - for i, text in enumerate(cells): - if i < len(row_cells): - row_cells[i].text = text - - def _add_math(self, doc: Document, content: str) -> None: - """Add a math block. - - For proper OMML rendering, this would need more complex conversion. - Currently renders as italic text with the LaTeX source. - """ - para = doc.add_paragraph() - para.alignment = WD_ALIGN_PARAGRAPH.CENTER - - run = para.add_run(content) - run.italic = True - run.font.name = "Cambria Math" - run.font.size = Pt(12) - diff --git a/app/services/image_processor.py b/app/services/image_processor.py index 34a6419..d7abed1 100644 --- a/app/services/image_processor.py +++ b/app/services/image_processor.py @@ -116,7 +116,7 @@ class ImageProcessor: else: raise ValueError("Either image_url or image_base64 must be provided") - return self.add_padding(image) + return image def image_to_base64(self, image: np.ndarray, format: str = "PNG") -> str: """Convert numpy image to base64 string. diff --git a/app/services/layout_detector.py b/app/services/layout_detector.py index b7ed407..3cd8446 100644 --- a/app/services/layout_detector.py +++ b/app/services/layout_detector.py @@ -1,122 +1,157 @@ -"""DocLayout-YOLO wrapper for document layout detection.""" +"""PP-DocLayoutV2 wrapper for document layout detection.""" import numpy as np from app.schemas.image import LayoutInfo, LayoutRegion from app.core.config import get_settings +from paddleocr import LayoutDetection +from typing import Optional settings = get_settings() class LayoutDetector: - """Wrapper for DocLayout-YOLO model.""" + """Layout detector for PP-DocLayoutV2.""" - # Class names from DocLayout-YOLO - CLASS_NAMES = { - 0: "title", - 1: "plain_text", - 2: "abandon", - 3: "figure", - 4: "figure_caption", - 5: "table", - 6: "table_caption", - 7: "table_footnote", - 8: "isolate_formula", - 9: "formula_caption", + _layout_detector: Optional[LayoutDetection] = None + + # PP-DocLayoutV2 class ID to label mapping + CLS_ID_TO_LABEL: dict[int, str] = { + 0: "abstract", + 1: "algorithm", + 2: "aside_text", + 3: "chart", + 4: "content", + 5: "display_formula", + 6: "doc_title", + 7: "figure_title", + 8: "footer", + 9: "footer_image", + 10: "footnote", + 11: "formula_number", + 12: "header", + 13: "header_image", + 14: "image", + 15: "inline_formula", + 16: "number", + 17: "paragraph_title", + 18: "reference", + 19: "reference_content", + 20: "seal", + 21: "table", + 22: "text", + 23: "vertical_text", + 24: "vision_footnote", } - # Classes considered as plain text - PLAIN_TEXT_CLASSES = {"title", "plain_text", "figure_caption", "table_caption", "table_footnote"} + # Mapping from raw labels to normalized region types + LABEL_TO_TYPE: dict[str, str] = { + # Text types + "abstract": "text", + "algorithm": "text", + "aside_text": "text", + "content": "text", + "doc_title": "text", + "footer": "text", + "footnote": "text", + "header": "text", + "number": "text", + "paragraph_title": "text", + "reference": "text", + "reference_content": "text", + "text": "text", + "vertical_text": "text", + "vision_footnote": "text", + # Formula types + "display_formula": "formula", + "inline_formula": "formula", + "formula_number": "formula", + # Table types + "table": "table", + # Figure types + "chart": "figure", + "figure_title": 
"figure", + "footer_image": "figure", + "header_image": "figure", + "image": "figure", + "seal": "figure", + } - # Classes considered as formula - FORMULA_CLASSES = {"isolate_formula", "formula_caption"} - - def __init__(self, model_path: str, confidence_threshold: float = 0.2): - """Initialize the layout detector. + def __init__(self): + """Initialize layout detector. Args: - model_path: Path to the DocLayout-YOLO model weights. - confidence_threshold: Minimum confidence for detections. """ - self.model_path = model_path - self.confidence_threshold = confidence_threshold - self.model = None + _ = self._get_layout_detector() - def load_model(self) -> None: - """Load the DocLayout-YOLO model. + def _get_layout_detector(self): + """Get or create LayoutDetection instance.""" + if LayoutDetector._layout_detector is None: + LayoutDetector._layout_detector = LayoutDetection(model_name="PP-DocLayoutV2") + return LayoutDetector._layout_detector - Raises: - RuntimeError: If model cannot be loaded. - """ - try: - from doclayout_yolo import YOLOv10 - - self.model = YOLOv10(self.model_path) - except Exception as e: - raise RuntimeError(f"Failed to load DocLayout-YOLO model: {e}") from e - - def detect(self, image: np.ndarray, image_size: int = 1024) -> LayoutInfo: - """Detect document layout regions. + def detect(self, image: np.ndarray) -> LayoutInfo: + """Detect layout of the image using PP-DocLayoutV2. Args: - image: Input image as numpy array in BGR format. - image_size: Image size for prediction. + image: Input image as numpy array. Returns: - LayoutInfo with detected regions. - - Raises: - RuntimeError: If model not loaded. + LayoutInfo with detected regions and flags. """ - if self.model is None: - raise RuntimeError("Model not loaded. Call load_model() first.") - - # Run prediction - results = self.model.predict( - image, - imgsz=image_size, - conf=self.confidence_threshold, - device=settings.device, - ) + layout_detector = self._get_layout_detector() + result = layout_detector.predict(image) + # Parse the result regions: list[LayoutRegion] = [] - has_plain_text = False - has_formula = False + mixed_recognition = False - if results and len(results) > 0: - result = results[0] - if result.boxes is not None: - for box in result.boxes: - cls_id = int(box.cls[0].item()) - confidence = float(box.conf[0].item()) - bbox = box.xyxy[0].tolist() + # Handle result format: [{'input_path': ..., 'page_index': None, 'boxes': [...]}] + if isinstance(result, list) and len(result) > 0: + first_result = result[0] + if isinstance(first_result, dict) and "boxes" in first_result: + boxes = first_result.get("boxes", []) + else: + boxes = [] + else: + boxes = [] - class_name = self.CLASS_NAMES.get(cls_id, f"unknown_{cls_id}") + for box in boxes: + cls_id = box.get("cls_id") + label = box.get("label") or self.CLS_ID_TO_LABEL.get(cls_id, "other") + score = box.get("score", 0.0) + coordinate = box.get("coordinate", [0, 0, 0, 0]) - # Map to simplified type - if class_name in self.PLAIN_TEXT_CLASSES: - region_type = "text" - has_plain_text = True - elif class_name in self.FORMULA_CLASSES: - region_type = "formula" - has_formula = True - elif class_name in {"figure"}: - region_type = "figure" - elif class_name in {"table"}: - region_type = "table" - else: - region_type = class_name + # Normalize label to region type + region_type = self.LABEL_TO_TYPE.get(label, "text") - regions.append( - LayoutRegion( - type=region_type, - bbox=bbox, - confidence=confidence, - ) - ) + regions.append(LayoutRegion( + type=region_type, + 
bbox=coordinate, + confidence=score, + score=score, + )) - return LayoutInfo( - regions=regions, - has_plain_text=has_plain_text, - has_formula=has_formula, - ) + + mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions) + + return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition) + + +if __name__ == "__main__": + import cv2 + from app.services.image_processor import ImageProcessor + + layout_detector = LayoutDetector() + image_path = "test/timeout.png" + + image = cv2.imread(image_path) + image_processor = ImageProcessor(padding_ratio=0.15) + image = image_processor.add_padding(image) + + # Save the padded image for debugging + cv2.imwrite("debug_padded_image.png", image) + + + layout_info = layout_detector.detect(image) + print(layout_info) \ No newline at end of file diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 8c7fe41..5b65798 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -1,14 +1,12 @@ """PaddleOCR-VL client service for text and formula recognition.""" -import io -import tempfile -from pathlib import Path - -import cv2 import numpy as np - from app.core.config import get_settings -from app.schemas.image import LayoutInfo +from paddleocr import PaddleOCRVL +from typing import Optional +from app.services.layout_detector import LayoutDetector +from app.services.image_processor import ImageProcessor +from app.services.converter import Converter settings = get_settings() @@ -16,52 +14,40 @@ settings = get_settings() class OCRService: """Service for OCR using PaddleOCR-VL.""" - FORMULA_PROMPT = "Please recognize the mathematical formula in this image and output in LaTeX format." + _pipeline: Optional[PaddleOCRVL] = None + _layout_detector: Optional[LayoutDetector] = None def __init__( self, - vl_server_url: str | None = None, - pp_doclayout_model_dir: str | None = None, + vl_server_url: str, + layout_detector: LayoutDetector, + image_processor: ImageProcessor, + converter: Converter, ): """Initialize OCR service. Args: vl_server_url: URL of the vLLM server for PaddleOCR-VL. - pp_doclayout_model_dir: Path to PP-DocLayoutV2 model directory. + layout_detector: Layout detector instance. + image_processor: Image processor instance. """ self.vl_server_url = vl_server_url or settings.paddleocr_vl_url - self.pp_doclayout_model_dir = pp_doclayout_model_dir or settings.pp_doclayout_model_dir - self._pipeline = None - - def _get_pipeline(self): + self.layout_detector = layout_detector + self.image_processor = image_processor + self.converter = converter + def _get_pipeline(self): """Get or create PaddleOCR-VL pipeline. Returns: PaddleOCRVL pipeline instance. """ - if self._pipeline is None: - from paddleocr import PaddleOCRVL - - self._pipeline = PaddleOCRVL( + if OCRService._pipeline is None: + OCRService._pipeline = PaddleOCRVL( vl_rec_backend="vllm-server", vl_rec_server_url=self.vl_server_url, layout_detection_model_name="PP-DocLayoutV2", - layout_detection_model_dir=self.pp_doclayout_model_dir, ) - return self._pipeline - - def _save_temp_image(self, image: np.ndarray) -> str: - """Save image to a temporary file. - - Args: - image: Image as numpy array in BGR format. - - Returns: - Path to temporary file. - """ - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - cv2.imwrite(f.name, image) - return f.name + return OCRService._pipeline def recognize_mixed(self, image: np.ndarray) -> dict: """Recognize mixed content (text + formulas) using PP-DocLayoutV2. 
@@ -77,30 +63,21 @@ class OCRService: """ try: pipeline = self._get_pipeline() - temp_path = self._save_temp_image(image) - try: - results = list(pipeline.predict(temp_path)) + output = pipeline.predict(image, use_layout_detection=True) - markdown_content = "" - for result in results: - # PaddleOCR-VL results can be saved to markdown - md_buffer = io.StringIO() - result.save_to_markdown(save_path=md_buffer) - markdown_content += md_buffer.getvalue() + markdown_content = "" - # Convert markdown to other formats - latex = self._markdown_to_latex(markdown_content) - mathml = self._extract_mathml(markdown_content) + for res in output: + markdown_content += res.markdown.get("markdown_texts", "") - return { - "markdown": markdown_content, - "latex": latex, - "mathml": mathml, - } - finally: - Path(temp_path).unlink(missing_ok=True) + convert_result = self.converter.convert_to_formats(markdown_content) + return { + "markdown": markdown_content, + "latex": convert_result.latex, + "mathml": convert_result.mathml, + } except Exception as e: raise RuntimeError(f"Mixed recognition failed: {e}") from e @@ -116,188 +93,49 @@ class OCRService: Dict with 'latex', 'markdown', 'mathml' keys. """ try: - import httpx + pipeline = self._get_pipeline() - temp_path = self._save_temp_image(image) + output = pipeline.predict(image, use_layout_detection=False, prompt_label="formula") - try: - # Use vLLM API directly for formula recognition - import base64 + markdown_content = "" - with open(temp_path, "rb") as f: - image_base64 = base64.b64encode(f.read()).decode("utf-8") + for res in output: + markdown_content += res.markdown.get("markdown_texts", "") - # Call vLLM server with formula prompt - response = httpx.post( - f"{self.vl_server_url}/chat/completions", - json={ - "model": "paddleocr-vl", - "messages": [ - { - "role": "user", - "content": [ - {"type": "text", "text": self.FORMULA_PROMPT}, - { - "type": "image_url", - "image_url": {"url": f"data:image/png;base64,{image_base64}"}, - }, - ], - } - ], - "max_tokens": 1024, - }, - timeout=60.0, - ) - response.raise_for_status() - result = response.json() + convert_result = self.converter.convert_to_formats(markdown_content) - latex = result["choices"][0]["message"]["content"].strip() - - # Convert latex to other formats - markdown = self._latex_to_markdown(latex) - mathml = self._latex_to_mathml(latex) - - return { - "latex": latex, - "markdown": markdown, - "mathml": mathml, - } - finally: - Path(temp_path).unlink(missing_ok=True) - - except httpx.HTTPStatusError as e: - raise RuntimeError(f"Formula recognition failed: HTTP {e.response.status_code}") from e + return { + "latex": convert_result.latex, + "mathml": convert_result.mathml, + "markdown": markdown_content, + } except Exception as e: raise RuntimeError(f"Formula recognition failed: {e}") from e - def recognize(self, image: np.ndarray, layout_info: LayoutInfo) -> dict: - """Recognize content based on layout detection results. + def recognize(self, image: np.ndarray) -> dict: + """Recognize content using PaddleOCR-VL. Args: image: Input image as numpy array in BGR format. - layout_info: Layout detection results. Returns: - Dict with recognition results including mode used. + Dict with 'latex', 'markdown', 'mathml' keys. 
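+
+        The padded image is first passed through PP-DocLayoutV2; when
+        confident text regions are detected, the layout-aware mixed
+        text-and-formula pipeline is used, otherwise the whole image is
+        recognized as a standalone formula.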
""" - # Decision logic: - # - If plain text exists -> use mixed_recognition (PP-DocLayoutV2) - # - Otherwise -> use formula_recognition (VL with prompt) - if layout_info.has_plain_text: - result = self.recognize_mixed(image) - result["recognition_mode"] = "mixed_recognition" + padded_image = self.image_processor.add_padding(image) + layout_info = self.layout_detector.detect(padded_image) + if layout_info.MixedRecognition: + return self.recognize_mixed(image) else: - result = self.recognize_formula(image) - result["recognition_mode"] = "formula_recognition" + return self.recognize_formula(image) - return result - def _markdown_to_latex(self, markdown: str) -> str: - """Convert markdown to LaTeX. - - Simple conversion - wraps content in LaTeX document structure. - - Args: - markdown: Markdown content. - - Returns: - LaTeX representation. - """ - # Basic conversion: preserve math blocks, convert structure - lines = [] - in_code_block = False - - for line in markdown.split("\n"): - if line.startswith("```"): - in_code_block = not in_code_block - if in_code_block: - lines.append("\\begin{verbatim}") - else: - lines.append("\\end{verbatim}") - elif in_code_block: - lines.append(line) - elif line.startswith("# "): - lines.append(f"\\section{{{line[2:]}}}") - elif line.startswith("## "): - lines.append(f"\\subsection{{{line[3:]}}}") - elif line.startswith("### "): - lines.append(f"\\subsubsection{{{line[4:]}}}") - elif line.startswith("- "): - lines.append(f"\\item {line[2:]}") - elif line.startswith("$$"): - lines.append(line.replace("$$", "\\[").replace("$$", "\\]")) - elif "$" in line: - # Keep inline math as-is - lines.append(line) - else: - lines.append(line) - - return "\n".join(lines) - - def _latex_to_markdown(self, latex: str) -> str: - """Convert LaTeX to markdown. - - Args: - latex: LaTeX content. - - Returns: - Markdown representation. - """ - # Wrap LaTeX in markdown math block - if latex.strip(): - return f"$$\n{latex}\n$$" - return "" - - def _latex_to_mathml(self, latex: str) -> str: - """Convert LaTeX to MathML. - - Args: - latex: LaTeX content. - - Returns: - MathML representation. - """ - # Basic LaTeX to MathML conversion - # For production, consider using latex2mathml library - if not latex.strip(): - return "" - - try: - # Try to use latex2mathml if available - from latex2mathml.converter import convert - - return convert(latex) - except ImportError: - # Fallback: wrap in basic MathML structure - return f'{latex}' - except Exception: - return f'{latex}' - - def _extract_mathml(self, markdown: str) -> str: - """Extract and convert math from markdown to MathML. - - Args: - markdown: Markdown content. - - Returns: - MathML for any math content found. 
-        """
-        import re
-
-        # Find all math blocks
-        math_blocks = re.findall(r"\$\$(.*?)\$\$", markdown, re.DOTALL)
-        inline_math = re.findall(r"\$([^$]+)\$", markdown)
-
-        all_math = math_blocks + inline_math
-
-        if not all_math:
-            return ""
-
-        # Convert each to MathML and combine
-        mathml_parts = []
-        for latex in all_math:
-            mathml = self._latex_to_mathml(latex.strip())
-            if mathml:
-                mathml_parts.append(mathml)
-
-        return "\n".join(mathml_parts)
+
+
+if __name__ == "__main__":
+    import cv2
+
+    image_processor = ImageProcessor(padding_ratio=0.15)
+    layout_detector = LayoutDetector()
+    ocr_service = OCRService(
+        vl_server_url=settings.paddleocr_vl_url,
+        layout_detector=layout_detector,
+        image_processor=image_processor,
+        converter=Converter(),
+    )
+    image = cv2.imread("test/image.png")
+    ocr_result = ocr_service.recognize(image)
+    print(ocr_result)
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 92c9177..50a6860 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,30 +2,36 @@
 name = "doc-processer"
 version = "0.1.0"
 description = "Document processing API - Image to LaTeX/Markdown/MathML and Markdown to DOCX"
-readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.10"
 license = { text = "MIT" }
 authors = [
     { name = "YogeLiu" }
 ]
 dependencies = [
-    "fastapi>=0.115.0",
-    "uvicorn[standard]>=0.32.0",
-    "opencv-python>=4.10.0",
-    "python-multipart>=0.0.12",
-    "pydantic>=2.10.0",
-    "pydantic-settings>=2.6.0",
-    "httpx>=0.28.0",
-    "numpy>=1.26.0",
-    "pillow>=10.4.0",
-    "python-docx>=1.1.0",
-    "paddleocr>=2.9.0",
-    "doclayout-yolo>=0.0.2",
-    "latex2mathml>=3.77.0",
-    "paddle>=1.2.0",
+    "fastapi==0.128.0",
+    "uvicorn[standard]==0.40.0",
+    "opencv-python==4.12.0.88",
+    "python-multipart==0.0.21",
+    "pydantic==2.12.5",
+    "pydantic-settings==2.12.0",
+    "httpx==0.28.1",
+    "numpy==2.2.6",
+    "pillow==12.0.0",
+    "python-docx==1.2.0",
+    "paddleocr==3.3.2",
+    "doclayout-yolo==0.0.4",
+    "latex2mathml==3.78.1",
+    "paddle==1.2.0",
+    "pypandoc==1.16.2",
+    "paddlepaddle",
+    "paddleocr[doc-parser]",
+    "safetensors"
 ]

+[tool.uv.sources]
+paddlepaddle = { path = "wheels/paddlepaddle-3.4.0.dev20251224-cp310-cp310-linux_x86_64.whl" }
+
 [project.optional-dependencies]
 dev = [
     "pytest>=8.0.0",