feat: add PDF document recognition with 10-page pre-hook

- Migrate recognition_results table to JSON schema (meta_data + content),
  replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
  rendering; limits processing to first 10 pages (pre-hook)
- Reuse existing downstream OCR endpoint (cloud.texpixel.com) for each
  page image; stores results as [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-31 14:17:44 +08:00
parent 876e64366b
commit 9d712c921a
14 changed files with 760 additions and 67 deletions

View File

@@ -70,7 +70,7 @@ func (h *OSSEndpoint) GetSignatureURL(ctx *gin.Context) {
ctx.JSON(http.StatusOK, common.ErrorResponse(ctx, common.CodeParamError, "invalid file name"))
return
}
if !utils.InArray(extend, []string{".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"}) {
if !utils.InArray(extend, []string{".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp", ".pdf"}) {
ctx.JSON(http.StatusOK, common.ErrorResponse(ctx, common.CodeParamError, "invalid file type"))
return
}