chore: optimize prompt

2026-03-10 21:36:35 +08:00
parent d98fa7237c
commit a9d3a35dd7
1 changed files with 7 additions and 22 deletions
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -150,9 +150,7 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
            # Strategy: remove spaces before \ and between non-command chars,
            # but preserve the space after \command when followed by a non-\ char
            cleaned = re.sub(r"\s+(?=\\)", "", content)  # remove space before \cmd
-            cleaned = re.sub(
+            cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned)  # remove space after non-letter non-\
-                r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned
-            )  # remove space after non-letter non-\
        return f"{operator}{{{cleaned}}}"
    # Match _{ ... } or ^{ ... }
@@ -630,9 +628,7 @@ class MineruOCRService(OCRServiceBase):
        self.glm_ocr_url = glm_ocr_url
        self.openai_client = OpenAI(api_key="EMPTY", base_url=glm_ocr_url, timeout=3600)
-    def _recognize_formula_with_paddleocr_vl(
+    def _recognize_formula_with_paddleocr_vl(self, image: np.ndarray, prompt: str = "Formula Recognition:") -> str:
-        self, image: np.ndarray, prompt: str = "Formula Recognition:"
-    ) -> str:
        """Recognize formula using PaddleOCR-VL API.
        Args:
@@ -673,9 +669,7 @@ class MineruOCRService(OCRServiceBase):
        except Exception as e:
            raise RuntimeError(f"PaddleOCR-VL formula recognition failed: {e}") from e
-    def _extract_and_recognize_formulas(
+    def _extract_and_recognize_formulas(self, markdown_content: str, original_image: np.ndarray) -> str:
-        self, markdown_content: str, original_image: np.ndarray
-    ) -> str:
        """Extract image references from markdown and recognize formulas.
        Args:
@@ -757,9 +751,7 @@ class MineruOCRService(OCRServiceBase):
                markdown_content = result["results"]["image"].get("md_content", "")
            if "![](images/" in markdown_content:
-                markdown_content = self._extract_and_recognize_formulas(
+                markdown_content = self._extract_and_recognize_formulas(markdown_content, original_image)
-                    markdown_content, original_image
-                )
            # Apply postprocessing to fix OCR errors
            markdown_content = _postprocess_markdown(markdown_content)
@@ -789,15 +781,11 @@ class MineruOCRService(OCRServiceBase):
 # Task-specific prompts (from GLM-OCR SDK config.yaml)
 _TASK_PROMPTS: dict[str, str] = {
-    "text": "Text Recognition:",
+    "text": "Text Recognition. If the content is a formula, please ouput latex code, else output text",
    "formula": "Formula Recognition:",
    "table": "Table Recognition:",
 }
-_DEFAULT_PROMPT = (
+_DEFAULT_PROMPT = "Text Recognition. If the content is a formula, please ouput latex code, else output text"
-    "Recognize the text in the image and output in Markdown format. "
-    "Preserve the original layout (headings/paragraphs/tables/formulas). "
-    "Do not fabricate content that does not exist in the image."
-)
 class GLMOCREndToEndService(OCRServiceBase):
@@ -921,10 +909,7 @@ class GLMOCREndToEndService(OCRServiceBase):
                # Parallel OCR calls
                raw_results: dict[int, str] = {}
                with ThreadPoolExecutor(max_workers=min(self.max_workers, len(tasks))) as ex:
-                    future_map = {
+                    future_map = {ex.submit(self._call_vllm, cropped, prompt): idx for idx, region, cropped, prompt in tasks}
-                        ex.submit(self._call_vllm, cropped, prompt): idx
-                        for idx, region, cropped, prompt in tasks
-                    }
                    for future in as_completed(future_map):
                        idx = future_map[future]
                        try: