feat: add PDF document recognition with 10-page pre-hook

- Migrate recognition_results table to JSON schema (meta_data + content),
  replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
  rendering; limit processing to the first 10 pages (pre-hook)
- Reuse the existing downstream OCR endpoint (cloud.texpixel.com) for each
  page image; store results as a [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-31 14:17:44 +08:00
parent 876e64366b
commit 9d712c921a
14 changed files with 760 additions and 67 deletions

View File

@@ -169,18 +169,21 @@ func (s *RecognitionService) GetFormualTask(ctx context.Context, taskNo string)
return nil, common.NewError(common.CodeDBError, "查询任务结果失败", err)
}
// 构建 Markdown 格式
markdown := taskRet.Markdown
if markdown == "" {
markdown = fmt.Sprintf("$$%s$$", taskRet.Latex)
formulaContent, err := taskRet.GetFormulaContent()
if err != nil {
log.Error(ctx, "func", "GetFormualTask", "msg", "解析公式内容失败", "error", err)
return nil, common.NewError(common.CodeSystemError, "解析识别结果失败", err)
}
markdown := formulaContent.Markdown
if markdown == "" {
markdown = fmt.Sprintf("$$%s$$", formulaContent.Latex)
}
return &formula.GetFormulaTaskResponse{
TaskNo: taskNo,
Latex: taskRet.Latex,
Latex: formulaContent.Latex,
Markdown: markdown,
MathML: taskRet.MathML,
MML: taskRet.MML,
MathML: formulaContent.MathML,
MML: formulaContent.MML,
Status: int(task.Status),
}, nil
}
@@ -539,14 +542,26 @@ func (s *RecognitionService) processFormulaTask(ctx context.Context, taskID int6
log.Error(ctx, "func", "processFormulaTask", "msg", "解析响应JSON失败", "error", err)
return err
}
err = resultDao.Create(tx, dao.RecognitionResult{
TaskID: taskID,
TaskType: dao.TaskTypeFormula,
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
Latex: ocrResp.Latex,
Markdown: ocrResp.Markdown,
MathML: ocrResp.MathML,
MML: ocrResp.MML,
})
if err != nil {
log.Error(ctx, "func", "processFormulaTask", "msg", "序列化公式内容失败", "error", err)
return err
}
result := dao.RecognitionResult{
TaskID: taskID,
TaskType: dao.TaskTypeFormula,
Content: contentJSON,
}
if err = result.SetMetaData(dao.ResultMetaData{TotalNum: 1}); err != nil {
log.Error(ctx, "func", "processFormulaTask", "msg", "序列化MetaData失败", "error", err)
return err
}
err = resultDao.Create(tx, result)
if err != nil {
log.Error(ctx, "func", "processFormulaTask", "msg", "保存任务结果失败", "error", err)
return err
@@ -662,15 +677,25 @@ func (s *RecognitionService) processVLFormulaTask(ctx context.Context, taskID in
return err
}
if result == nil {
formulaRes := &dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Latex: latex}
err = resultDao.Create(dao.DB.WithContext(ctx), *formulaRes)
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{Latex: latex})
if err != nil {
log.Error(ctx, "func", "processVLFormulaTask", "msg", "序列化公式内容失败", "error", err)
return err
}
newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
if err != nil {
log.Error(ctx, "func", "processVLFormulaTask", "msg", "创建任务结果失败", "error", err)
return err
}
} else {
result.Latex = latex
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{"latex": latex})
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{Latex: latex})
if err != nil {
log.Error(ctx, "func", "processVLFormulaTask", "msg", "序列化公式内容失败", "error", err)
return err
}
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{"content": contentJSON})
if err != nil {
log.Error(ctx, "func", "processVLFormulaTask", "msg", "更新任务结果失败", "error", err)
return err
@@ -851,23 +876,35 @@ func (s *RecognitionService) processMathpixTask(ctx context.Context, taskID int6
if result == nil {
// 创建新结果
err = resultDao.Create(dao.DB.WithContext(ctx), dao.RecognitionResult{
TaskID: taskID,
TaskType: dao.TaskTypeFormula,
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
Latex: mathpixResp.LatexStyled,
Markdown: mathpixResp.Text,
MathML: mathpixResp.GetMathML(),
})
if err != nil {
log.Error(ctx, "func", "processMathpixTask", "msg", "序列化公式内容失败", "error", err)
return err
}
newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
if err != nil {
log.Error(ctx, "func", "processMathpixTask", "msg", "创建任务结果失败", "error", err)
return err
}
} else {
// 更新现有结果
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
Latex: mathpixResp.LatexStyled,
Markdown: mathpixResp.Text,
MathML: mathpixResp.GetMathML(),
})
if err != nil {
log.Error(ctx, "func", "processMathpixTask", "msg", "序列化公式内容失败", "error", err)
return err
}
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{
"latex": mathpixResp.LatexStyled,
"markdown": mathpixResp.Text,
"mathml": mathpixResp.GetMathML(),
"content": contentJSON,
})
if err != nil {
log.Error(ctx, "func", "processMathpixTask", "msg", "更新任务结果失败", "error", err)
@@ -1027,23 +1064,35 @@ func (s *RecognitionService) processBaiduOCRTask(ctx context.Context, taskID int
if result == nil {
// 创建新结果
err = resultDao.Create(dao.DB.WithContext(ctx), dao.RecognitionResult{
TaskID: taskID,
TaskType: dao.TaskTypeFormula,
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
Markdown: markdownResult,
Latex: latex,
MathML: mml,
})
if err != nil {
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "序列化公式内容失败", "error", err)
return err
}
newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
if err != nil {
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "创建任务结果失败", "error", err)
return err
}
} else {
// 更新现有结果
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
Markdown: markdownResult,
Latex: latex,
MathML: mml,
})
if err != nil {
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "序列化公式内容失败", "error", err)
return err
}
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{
"markdown": markdownResult,
"latex": latex,
"mathml": mml,
"content": contentJSON,
})
if err != nil {
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "更新任务结果失败", "error", err)