fix: encode non-ASCII filename in Content-Disposition header

Use RFC 5987 filename*=UTF-8'' percent-encoding to support Chinese and other Unicode characters in DOCX download filenames. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
fix: single dollar symbol
2026-03-13 17:41:18 +08:00 · 2026-03-12 23:20:14 +08:00 · 2026-03-12 22:30:27 +08:00 · 2026-03-12 12:41:43 +08:00 · 2026-03-12 12:41:26 +08:00 · 2026-03-12 12:38:18 +08:00
12 changed files with 2988 additions and 63 deletions
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -8,7 +8,8 @@
      "WebFetch(domain:raw.githubusercontent.com)",
      "Bash(python -c \"\nfrom app.services.glm_postprocess import GLMResultFormatter, clean_repeated_content, clean_formula_number\nf = GLMResultFormatter\\(\\)\nprint\\('GLMResultFormatter OK'\\)\nprint\\('clean_formula_number:', clean_formula_number\\('\\(2.1\\)'\\)\\)\nregions = [\n    {'index': 0, 'label': 'text', 'native_label': 'doc_title', 'content': 'Introduction', 'bbox_2d': [10,10,990,50]},\n    {'index': 1, 'label': 'formula', 'native_label': 'display_formula', 'content': r'\\\\frac{a}{b}', 'bbox_2d': [10,60,990,200]},\n    {'index': 2, 'label': 'text', 'native_label': 'formula_number', 'content': '\\(1\\)', 'bbox_2d': [900,60,990,200]},\n]\nmd = f.process\\(regions\\)\nprint\\('process output:'\\)\nprint\\(md\\)\n\" 2>&1 | grep -v \"^$\")",
      "Bash(python3 -c \"\nfrom app.services.glm_postprocess import GLMResultFormatter, clean_repeated_content, clean_formula_number\nf = GLMResultFormatter\\(\\)\nprint\\('GLMResultFormatter OK'\\)\nprint\\('clean_formula_number:', clean_formula_number\\('\\(2.1\\)'\\)\\)\nregions = [\n    {'index': 0, 'label': 'text', 'native_label': 'doc_title', 'content': 'Introduction', 'bbox_2d': [10,10,990,50]},\n    {'index': 1, 'label': 'formula', 'native_label': 'display_formula', 'content': r'\\\\frac{a}{b}', 'bbox_2d': [10,60,990,200]},\n    {'index': 2, 'label': 'text', 'native_label': 'formula_number', 'content': '\\(1\\)', 'bbox_2d': [900,60,990,200]},\n]\nmd = f.process\\(regions\\)\nprint\\('process output:'\\)\nprint\\(repr\\(md\\)\\)\n\" 2>&1)",
-      "Bash(ls .venv 2>/dev/null || ls venv 2>/dev/null || echo \"no venv found\" && find . -name \"activate\" -path \"*/bin/activate\" 2>/dev/null | head -3)"
+      "Bash(ls .venv 2>/dev/null || ls venv 2>/dev/null || echo \"no venv found\" && find . -name \"activate\" -path \"*/bin/activate\" 2>/dev/null | head -3)",
      "Bash(ruff check:*)"
    ]
  }
 }
--- a/app/api/v1/endpoints/convert.py
+++ b/app/api/v1/endpoints/convert.py
@@ -1,12 +1,17 @@
 """Format conversion endpoints."""
 from urllib.parse import quote
 from fastapi import APIRouter, Depends, HTTPException
 from fastapi.responses import Response
 from app.core.dependencies import get_converter
 from app.core.logging_config import get_logger
 from app.schemas.convert import LatexToOmmlRequest, LatexToOmmlResponse, MarkdownToDocxRequest
 from app.services.converter import Converter
 logger = get_logger()
 router = APIRouter()
@@ -19,14 +24,26 @@ async def convert_markdown_to_docx(
    Returns the generated DOCX file as a binary response.
    """
    logger.info(
        "Converting markdown to DOCX, filename=%s, content_length=%d",
        request.filename,
        len(request.markdown),
    )
    try:
        docx_bytes = converter.export_to_file(request.markdown, export_type="docx")
        logger.info(
            "DOCX conversion successful, filename=%s, size=%d bytes",
            request.filename,
            len(docx_bytes),
        )
        encoded_name = quote(f"{request.filename}.docx")
        return Response(
            content=docx_bytes,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-            headers={"Content-Disposition": f'attachment; filename="{request.filename}.docx"'},
+            headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_name}"},
        )
    except Exception as e:
        logger.exception("DOCX conversion failed, filename=%s: %s", request.filename, e)
        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}")
@@ -55,12 +72,17 @@ async def convert_latex_to_omml(
        ```
    """
    if not request.latex or not request.latex.strip():
        logger.warning("LaTeX to OMML request received with empty formula")
        raise HTTPException(status_code=400, detail="LaTeX formula cannot be empty")
    logger.info("Converting LaTeX to OMML, latex=%r", request.latex)
    try:
        omml = converter.convert_to_omml(request.latex)
        logger.info("LaTeX to OMML conversion successful")
        return LatexToOmmlResponse(omml=omml)
    except ValueError as e:
        logger.warning("LaTeX to OMML conversion invalid input: %s", e)
        raise HTTPException(status_code=400, detail=str(e))
    except RuntimeError as e:
        logger.error("LaTeX to OMML conversion runtime error: %s", e)
        raise HTTPException(status_code=503, detail=str(e))
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -50,9 +50,7 @@ class Settings(BaseSettings):
    max_tokens: int = 4096
    # Model Paths
-    pp_doclayout_model_dir: str | None = (
+    pp_doclayout_model_dir: str | None = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV3"
        "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV3"
    )
    # Image Processing
    max_image_size_mb: int = 10
--- a/app/core/logging_config.py
+++ b/app/core/logging_config.py
@@ -2,11 +2,15 @@
 import logging
 import logging.handlers
 from contextvars import ContextVar
 from pathlib import Path
 from typing import Any
 from app.core.config import get_settings
 # Context variable to hold the current request_id across async boundaries
 request_id_ctx: ContextVar[str] = ContextVar("request_id", default="-")
 class TimedRotatingAndSizeFileHandler(logging.handlers.TimedRotatingFileHandler):
    """File handler that rotates by both time (daily) and size (100MB)."""
@@ -92,14 +96,13 @@ def setup_logging(log_dir: str | None = None) -> logging.Logger:
    # Remove existing handlers to avoid duplicates
    logger.handlers.clear()
-    # Create custom formatter that handles missing request_id
+    # Create custom formatter that automatically injects request_id from context
    class RequestIDFormatter(logging.Formatter):
-        """Formatter that handles request_id in log records."""
+        """Formatter that injects request_id from ContextVar into log records."""
        def format(self, record):
            # Add request_id if not present
            if not hasattr(record, "request_id"):
-                record.request_id = getattr(record, "request_id", "unknown")
+                record.request_id = request_id_ctx.get()
            return super().format(record)
    formatter = RequestIDFormatter(
@@ -138,7 +141,7 @@ _logger: logging.Logger | None = None
 def get_logger() -> logging.Logger:
-    """Get the global logger instance."""
+    """Get the global logger instance, initializing if needed."""
    global _logger
    if _logger is None:
        _logger = setup_logging()
--- a/app/main.py
+++ b/app/main.py
@@ -8,6 +8,7 @@ from app.api.v1.router import api_router
 from app.core.config import get_settings
 from app.core.dependencies import init_layout_detector
 from app.core.logging_config import setup_logging
 from app.middleware.request_id import RequestIDMiddleware
 settings = get_settings()
@@ -33,6 +34,8 @@ app = FastAPI(
    lifespan=lifespan,
 )
 app.add_middleware(RequestIDMiddleware)
 # Include API router
 app.include_router(api_router, prefix=settings.api_prefix)
--- a/app/middleware/init.py
+++ b/app/middleware/init.py
--- a/app/middleware/request_id.py
+++ b/app/middleware/request_id.py
@@ -0,0 +1,34 @@
 """Middleware to propagate or generate request_id for every request."""
 import uuid
 from starlette.middleware.base import BaseHTTPMiddleware
 from starlette.requests import Request
 from starlette.responses import Response
 from app.core.logging_config import request_id_ctx
 REQUEST_ID_HEADER = "X-Request-ID"
 class RequestIDMiddleware(BaseHTTPMiddleware):
    """Tag every request with a request_id and echo it back to the caller.
    The id is taken from the incoming X-Request-ID header; when that header is
    missing or empty, a fresh UUID4 string is generated instead.
    While the downstream handler runs, the id is published through the
    request_id_ctx ContextVar so code handling the request can read it without
    having it passed explicitly through every call.
    The same id is written to the response's X-Request-ID header so that
    callers can correlate logs.
    """
    async def dispatch(self, request: Request, call_next) -> Response:
        """Bind the request_id around call_next and attach it to the response."""
        incoming_id = request.headers.get(REQUEST_ID_HEADER)
        request_id = incoming_id if incoming_id else str(uuid.uuid4())
        ctx_token = request_id_ctx.set(request_id)
        try:
            response = await call_next(request)
        finally:
            # Restore the previous ContextVar value even when the handler raises.
            request_id_ctx.reset(ctx_token)
        response.headers[REQUEST_ID_HEADER] = request_id
        return response
--- a/app/services/glm_postprocess.py
+++ b/app/services/glm_postprocess.py
@@ -34,13 +34,7 @@ def find_consecutive_repeat(s: str, min_unit_len: int = 10, min_repeats: int = 1
        return None
    pattern = re.compile(
-        r"(.{"
+        r"(.{" + str(min_unit_len) + "," + str(max_unit_len) + r"}?)\1{" + str(min_repeats - 1) + ",}",
        + str(min_unit_len)
        + ","
        + str(max_unit_len)
        + r"}?)\1{"
        + str(min_repeats - 1)
        + ",}",
        re.DOTALL,
    )
    match = pattern.search(s)
@@ -74,9 +68,7 @@ def clean_repeated_content(
        if count >= line_threshold and (count / total_lines) >= 0.8:
            for i, line in enumerate(lines):
                if line == common:
-                    consecutive = sum(
+                    consecutive = sum(1 for j in range(i, min(i + 3, len(lines))) if lines[j] == common)
                        1 for j in range(i, min(i + 3, len(lines))) if lines[j] == common
                    )
                    if consecutive >= 3:
                        original_lines = content.split("\n")
                        non_empty_count = 0
@@ -113,6 +105,11 @@ def clean_formula_number(number_content: str) -> str:
 # GLMResultFormatter
 # ---------------------------------------------------------------------------
 # Matches content that consists *entirely* of a display-math block and nothing else.
 # Used to detect when a text/heading region was actually recognised as a formula by vLLM,
 # so we can correct the label before heading prefixes (## …) are applied.
 _PURE_DISPLAY_FORMULA_RE = re.compile(r"^\s*(?:\$\$[\s\S]+?\$\$|\\\[[\s\S]+?\\\])\s*$")
 # Label → canonical category mapping (mirrors GLM-OCR label_visualization_mapping)
 _LABEL_TO_CATEGORY: dict[str, str] = {
    # text
@@ -173,6 +170,19 @@ class GLMResultFormatter:
            item["native_label"] = item.get("native_label", item.get("label", "text"))
            item["label"] = self._map_label(item.get("label", "text"), item["native_label"])
            # Label correction: layout may say "text" (or a heading like "paragraph_title")
            # but vLLM recognised the content as a formula and returned $$…$$.  Without
            # correction the heading prefix (##) would be prepended to the math block,
            # producing broken output like "## $$ \mathbf{y}=… $$".
            raw_content = (item.get("content") or "").strip()
            if item["label"] == "text" and _PURE_DISPLAY_FORMULA_RE.match(raw_content):
                logger.debug(
                    "Label corrected text (native=%s) → formula: pure display-formula detected",
                    item["native_label"],
                )
                item["label"] = "formula"
                item["native_label"] = "display_formula"
            item["content"] = self._format_content(
                item.get("content") or "",
                item["label"],
@@ -255,16 +265,14 @@ class GLMResultFormatter:
        # Formula wrapping
        if label == "formula":
            content = content.strip()
-            for s, e in [("$$", "$$"), (r"\[", r"\]"), (r"\(", r"\)")]:
+            for s, e in [("$$", "$$"), (r"\[", r"\]"), (r"\(", r"\)"), ("$", "$")]:
                if content.startswith(s):
                    content = content[len(s) :].strip()
                    if content.endswith(e):
                        content = content[: -len(e)].strip()
                    break
            if not content:
-                logger.warning(
+                logger.warning("Skipping formula region with empty content after stripping delimiters")
                    "Skipping formula region with empty content after stripping delimiters"
                )
                return ""
            content = "$$\n" + content + "\n$$"
@@ -314,9 +322,7 @@ class GLMResultFormatter:
                    formula_content = items[i + 1].get("content", "")
                    merged_block = deepcopy(items[i + 1])
                    if formula_content.endswith("\n$$"):
-                        merged_block["content"] = (
+                        merged_block["content"] = formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$"
                            formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$"
                        )
                    merged.append(merged_block)
                    skip.add(i + 1)
                continue  # always skip the formula_number block itself
@@ -328,9 +334,7 @@ class GLMResultFormatter:
                    formula_content = block.get("content", "")
                    merged_block = deepcopy(block)
                    if formula_content.endswith("\n$$"):
-                        merged_block["content"] = (
+                        merged_block["content"] = formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$"
                            formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$"
                        )
                    merged.append(merged_block)
                    skip.add(i + 1)
                    continue
@@ -390,9 +394,7 @@ class GLMResultFormatter:
            block["index"] = i
        return merged
-    def _format_bullet_points(
+    def _format_bullet_points(self, items: list[dict], left_align_threshold: float = 10.0) -> list[dict]:
        self, items: list[dict], left_align_threshold: float = 10.0
    ) -> list[dict]:
        """Add missing bullet prefix when a text block is sandwiched between two bullet items."""
        if len(items) < 3:
            return items
@@ -422,10 +424,7 @@ class GLMResultFormatter:
            if not (cur_bbox and prev_bbox and nxt_bbox):
                continue
-            if (
+            if abs(cur_bbox[0] - prev_bbox[0]) <= left_align_threshold and abs(cur_bbox[0] - nxt_bbox[0]) <= left_align_threshold:
                abs(cur_bbox[0] - prev_bbox[0]) <= left_align_threshold
                and abs(cur_bbox[0] - nxt_bbox[0]) <= left_align_threshold
            ):
                cur["content"] = "- " + cur_content
        return items
--- a/app/services/layout_detector.py
+++ b/app/services/layout_detector.py
@@ -148,7 +148,7 @@ class LayoutDetector:
                )
            )
-        mixed_recognition = any(region.type == "text" and region.score > 0.3 for region in regions)
+        mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions)
        return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition)
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -150,9 +150,7 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
            # Strategy: remove spaces before \ and between non-command chars,
            # but preserve the space after \command when followed by a non-\ char
            cleaned = re.sub(r"\s+(?=\\)", "", content)  # remove space before \cmd
-            cleaned = re.sub(
+            cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned)  # remove space after non-letter non-\
                r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned
            )  # remove space after non-letter non-\
        return f"{operator}{{{cleaned}}}"
    # Match _{ ... } or ^{ ... }
@@ -630,9 +628,7 @@ class MineruOCRService(OCRServiceBase):
        self.glm_ocr_url = glm_ocr_url
        self.openai_client = OpenAI(api_key="EMPTY", base_url=glm_ocr_url, timeout=3600)
-    def _recognize_formula_with_paddleocr_vl(
+    def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str:
        self, image: np.ndarray, prompt: str = "Formula Recognition:"
    ) -> str:
        """Recognize formula using PaddleOCR-VL API.
        Args:
@@ -673,9 +669,7 @@ class MineruOCRService(OCRServiceBase):
        except Exception as e:
            raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e
-    def _extract_and_recognize_formulas(
+    def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str:
        self, markdown_content: str, original_image: np.ndarray
    ) -> str:
        """Extract image references from markdown and recognize formulas.
        Args:
@@ -757,9 +751,7 @@ class MineruOCRService(OCRServiceBase):
                markdown_content = result["results"]["image"].get("md_content", "")
            if "![](images/" in markdown_content:
-                markdown_content = self._extract_and_recognize_formulas(
+                markdown_content = self._extract_and_recognize_formulas(markdown_content, original_image)
                    markdown_content, original_image
                )
            # Apply postprocessing to fix OCR errors
            markdown_content = _postprocess_markdown(markdown_content)
@@ -789,15 +781,11 @@ class MineruOCRService(OCRServiceBase):
 # Task-specific prompts (from GLM-OCR SDK config.yaml)
 _TASK_PROMPTS: dict[str, str] = {
-    "text": "Text Recognition:",
+    "text": "Text Recognition. If the content is a formula, please output display latex code, else output text",
    "formula": "Formula Recognition:",
    "table": "Table Recognition:",
 }
-_DEFAULT_PROMPT = (
+_DEFAULT_PROMPT = "Text Recognition. If the content is a formula, please output display latex code, else output text"
    "Recognize the text in the image and output in Markdown format. "
    "Preserve the original layout (headings/paragraphs/tables/formulas). "
    "Do not fabricate content that does not exist in the image."
 )
 class GLMOCREndToEndService(OCRServiceBase):
@@ -880,13 +868,14 @@ class GLMOCREndToEndService(OCRServiceBase):
        """
        # 1. Layout detection
        img_h, img_w = image.shape[:2]
-        layout_info = self.layout_detector.detect(image)
+        padded_image = self.image_processor.add_padding(image)
        layout_info = self.layout_detector.detect(padded_image)
        # Sort regions in reading order: top-to-bottom, left-to-right
        layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0]))
        # 3. OCR: per-region (parallel) or full-image fallback
-        if not layout_info.regions:
+        if not layout_info.regions or (len(layout_info.regions) == 1 and not layout_info.MixedRecognition):
            # No layout detected → assume it's a formula, use formula recognition
            logger.info("No layout regions detected, treating image as formula")
            raw_content = self._call_vllm(image, _TASK_PROMPTS["formula"])
@@ -902,7 +891,7 @@ class GLMOCREndToEndService(OCRServiceBase):
                if region.type == "figure":
                    continue
                x1, y1, x2, y2 = (int(c) for c in region.bbox)
-                cropped = image[y1:y2, x1:x2]
+                cropped = padded_image[y1:y2, x1:x2]
                if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10:
                    logger.warning(
                        "Skipping region idx=%d (label=%s): crop too small %s",
@@ -921,10 +910,7 @@ class GLMOCREndToEndService(OCRServiceBase):
                # Parallel OCR calls
                raw_results: dict[int, str] = {}
                with ThreadPoolExecutor(max_workers=min(self.max_workers, len(tasks))) as ex:
-                    future_map = {
+                    future_map = {ex.submit(self._call_vllm, cropped, prompt): idx for idx, region, cropped, prompt in tasks}
                        ex.submit(self._call_vllm, cropped, prompt): idx
                        for idx, region, cropped, prompt in tasks
                    }
                    for future in as_completed(future_map):
                        idx = future_map[future]
                        try:
--- a/nohup.out
+++ b/nohup.out
--- a/tests/tools/layout.py
+++ b/tests/tools/layout.py
@@ -0,0 +1,35 @@
 import cv2
 from app.core.config import get_settings
 from app.services.layout_detector import LayoutDetector
 settings = get_settings()
 def debug_layout_detector():
    """Run layout detection on a sample image and save an annotated copy.
    Reads test/image2.png, runs LayoutDetector.detect on it, draws each
    detected region's bounding box and native_label onto the image, and
    writes the result to test/layout_debug.png.
    Raises:
        FileNotFoundError: If the input image cannot be read.
    """
    input_path = "test/image2.png"
    layout_detector = LayoutDetector()
    image = cv2.imread(input_path)
    # cv2.imread reports failure by returning None instead of raising; stop
    # here with a clear message rather than an AttributeError on image.shape.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {input_path}")
    print(f"Image shape: {image.shape}")
    # padded_image = ImageProcessor(padding_ratio=0.15).add_padding(image)
    layout_info = layout_detector.detect(image)
    # draw the layout info and label
    red = (0, 0, 255)  # red in OpenCV's BGR channel order
    for region in layout_info.regions:
        # Drawing functions need integer pixel coordinates.
        x1, y1, x2, y2 = (int(v) for v in region.bbox)
        cv2.putText(
            image,
            region.native_label,
            (x1, y1),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            red,
            2,
        )
        cv2.rectangle(image, (x1, y1), (x2, y2), red, 2)
    cv2.imwrite("test/layout_debug.png", image)
 if __name__ == "__main__":
    debug_layout_detector()
Author	SHA1	Message	Date
liuyuanchuang	39e72a5743	fix: encode non-ASCII filename in Content-Disposition header Use RFC 5987 filename*=UTF-8'' percent-encoding to support Chinese and other Unicode characters in DOCX download filenames. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-03-13 17:41:18 +08:00
yoge	aee1a1bf3b	fix: single dollar symbol	2026-03-12 23:20:14 +08:00
yoge	ff82021467	optimize: formula is recognize text	2026-03-12 22:30:27 +08:00
liuyuanchuang	11e9ed780d	Merge branch 'main' of https://code.texpixel.com/YogeLiu/doc_processer	2026-03-12 12:41:43 +08:00
liuyuanchuang	d1050acbdc	fix: logger path	2026-03-12 12:41:26 +08:00
yogeliu	16399f0929	fix: logger path	2026-03-12 12:38:18 +08:00
liuyuanchuang	92b56d61d8	feat: add log for export api	2026-03-12 11:40:19 +08:00
yoge	bb1cf66137	fix: optimize title to formula	2026-03-10 21:45:43 +08:00
yoge	a9d3a35dd7	chore: optimize prompt	2026-03-10 21:36:35 +08:00