fix: remove padding from GLMOCREndToEndService and clean up ruff violations

- Drop image padding in GLMOCREndToEndService.recognize(); use raw image directly
- Fix F821 undefined `padded` references replaced with `image`
- Fix F601 duplicate dict key "≠" in converter
- Fix F841 unused `image_cls_ids` variable in layout_postprocess
- Fix E702 semicolon-separated statements in layout_postprocess
- Fix UP031 percent-format replaced with f-string in logging_config
- Auto-fix 44 additional ruff violations (import order, UP035/UP045/UP006, F401, F541)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
liuyuanchuang
2026-03-10 19:52:22 +08:00
parent f8173f7c0a
commit 30d2c2f45b
16 changed files with 162 additions and 140 deletions

View File

@@ -112,14 +112,18 @@ class Converter:
# Pre-compiled regex patterns for preprocessing
_RE_VSPACE = re.compile(r"\\\[1mm\]")
_RE_BLOCK_FORMULA_INLINE = re.compile(r"([^\n])(\s*)\\\[(.*?)\\\]([^\n])", re.DOTALL)
_RE_BLOCK_FORMULA_LINE = re.compile(r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL)
_RE_BLOCK_FORMULA_LINE = re.compile(
r"^(\s*)\\\[(.*?)\\\](\s*)(?=\n|$)", re.MULTILINE | re.DOTALL
)
_RE_ARITHMATEX = re.compile(r'<span class="arithmatex">(.*?)</span>')
_RE_INLINE_SPACE = re.compile(r"(?<!\$)\$ +(.+?) +\$(?!\$)")
_RE_ARRAY_SPECIFIER = re.compile(r"\\begin\{array\}\{([^}]+)\}")
_RE_LEFT_BRACE = re.compile(r"\\left\\\{\s+")
_RE_RIGHT_BRACE = re.compile(r"\s+\\right\\\}")
_RE_CASES = re.compile(r"\\begin\{cases\}(.*?)\\end\{cases\}", re.DOTALL)
_RE_ALIGNED_BRACE = re.compile(r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL)
_RE_ALIGNED_BRACE = re.compile(
r"\\left\\\{\\begin\{aligned\}(.*?)\\end\{aligned\}\\right\.", re.DOTALL
)
_RE_ALIGNED = re.compile(r"\\begin\{aligned\}(.*?)\\end\{aligned\}", re.DOTALL)
_RE_TAG = re.compile(r"\$\$(.*?)\\tag\s*\{([^}]+)\}\s*\$\$", re.DOTALL)
_RE_VMATRIX = re.compile(r"\\begin\{vmatrix\}(.*?)\\end\{vmatrix\}", re.DOTALL)
@@ -368,7 +372,9 @@ class Converter:
mathml = latex_to_mathml(latex_formula)
return Converter._postprocess_mathml_for_word(mathml)
except Exception as e:
raise RuntimeError(f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}") from e
raise RuntimeError(
f"MathML conversion failed: {pandoc_error}. latex2mathml fallback also failed: {e}"
) from e
@staticmethod
def _postprocess_mathml_for_word(mathml: str) -> str:
@@ -583,7 +589,6 @@ class Converter:
"&#x21D3;": "", # Downarrow
"&#x2195;": "", # updownarrow
"&#x21D5;": "", # Updownarrow
"&#x2260;": "", # ne
"&#x226A;": "", # ll
"&#x226B;": "", # gg
"&#x2A7D;": "", # leqslant
@@ -962,7 +967,7 @@ class Converter:
"""Export to DOCX format using pypandoc."""
extra_args = [
"--highlight-style=pygments",
f"--reference-doc=app/pkg/reference.docx",
"--reference-doc=app/pkg/reference.docx",
]
pypandoc.convert_file(
input_path,