feat: add PDF document recognition with 10-page pre-hook

- Migrate recognition_results table to JSON schema (meta_data + content),
  replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
  rendering; limit processing to the first 10 pages (pre-hook)
- Reuse the existing downstream OCR endpoint (cloud.texpixel.com) for each
  page image; store results as a JSON array of {page_number, markdown} objects
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-31 14:17:44 +08:00
parent 876e64366b
commit 9d712c921a
14 changed files with 760 additions and 67 deletions

View File

@@ -89,17 +89,17 @@ func (svc *TaskService) GetTaskList(ctx context.Context, req *task.TaskListReque
Total: total,
}
for _, item := range tasks {
var latex string
var markdown string
var mathML string
var mml string
var latex, markdown, mathML, mml string
recognitionResult := recognitionResultMap[item.ID]
if recognitionResult != nil {
latex = recognitionResult.Latex
markdown = recognitionResult.Markdown
mathML = recognitionResult.MathML
mml = recognitionResult.MML
if recognitionResult != nil && recognitionResult.TaskType == dao.TaskTypeFormula {
if fc, err := recognitionResult.GetFormulaContent(); err == nil {
latex = fc.Latex
markdown = fc.Markdown
mathML = fc.MathML
mml = fc.MML
}
}
// PDF 类型的 TaskListDTO 暂不展开 content,列表页只显示状态
originURL, err := oss.GetDownloadURL(ctx, item.FileURL)
if err != nil {
log.Error(ctx, "func", "GetTaskList", "msg", "get origin url failed", "error", err)
@@ -148,10 +148,18 @@ func (svc *TaskService) ExportTask(ctx context.Context, req *task.ExportTaskRequ
return nil, "", errors.New("recognition result not found")
}
markdown := recognitionResult.Markdown
if markdown == "" {
log.Error(ctx, "func", "ExportTask", "msg", "markdown not found")
return nil, "", errors.New("markdown not found")
var markdown string
switch recognitionResult.TaskType {
case dao.TaskTypeFormula:
fc, err := recognitionResult.GetFormulaContent()
if err != nil || fc.Markdown == "" {
log.Error(ctx, "func", "ExportTask", "msg", "公式结果解析失败或markdown为空", "error", err)
return nil, "", errors.New("markdown not found")
}
markdown = fc.Markdown
default:
log.Error(ctx, "func", "ExportTask", "msg", "不支持的导出任务类型", "task_type", recognitionResult.TaskType)
return nil, "", errors.New("unsupported task type for export")
}
// 获取文件名(去掉扩展名)