feat: add PDF document recognition with 10-page pre-hook
- Migrate the recognition_results table to a JSON schema (meta_data + content),
  replacing the flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
  rendering; limits processing to the first 10 pages (pre-hook)
- Reuse the existing downstream OCR endpoint (cloud.texpixel.com) for each
  page image; stores the results as a JSON array of the form [{page_number, markdown}]
- Add a Redis queue + distributed lock for the PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -169,18 +169,21 @@ func (s *RecognitionService) GetFormualTask(ctx context.Context, taskNo string)
|
||||
return nil, common.NewError(common.CodeDBError, "查询任务结果失败", err)
|
||||
}
|
||||
|
||||
// 构建 Markdown 格式
|
||||
markdown := taskRet.Markdown
|
||||
if markdown == "" {
|
||||
markdown = fmt.Sprintf("$$%s$$", taskRet.Latex)
|
||||
formulaContent, err := taskRet.GetFormulaContent()
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "GetFormualTask", "msg", "解析公式内容失败", "error", err)
|
||||
return nil, common.NewError(common.CodeSystemError, "解析识别结果失败", err)
|
||||
}
|
||||
markdown := formulaContent.Markdown
|
||||
if markdown == "" {
|
||||
markdown = fmt.Sprintf("$$%s$$", formulaContent.Latex)
|
||||
}
|
||||
|
||||
return &formula.GetFormulaTaskResponse{
|
||||
TaskNo: taskNo,
|
||||
Latex: taskRet.Latex,
|
||||
Latex: formulaContent.Latex,
|
||||
Markdown: markdown,
|
||||
MathML: taskRet.MathML,
|
||||
MML: taskRet.MML,
|
||||
MathML: formulaContent.MathML,
|
||||
MML: formulaContent.MML,
|
||||
Status: int(task.Status),
|
||||
}, nil
|
||||
}
|
||||
@@ -539,14 +542,26 @@ func (s *RecognitionService) processFormulaTask(ctx context.Context, taskID int6
|
||||
log.Error(ctx, "func", "processFormulaTask", "msg", "解析响应JSON失败", "error", err)
|
||||
return err
|
||||
}
|
||||
err = resultDao.Create(tx, dao.RecognitionResult{
|
||||
TaskID: taskID,
|
||||
TaskType: dao.TaskTypeFormula,
|
||||
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
|
||||
Latex: ocrResp.Latex,
|
||||
Markdown: ocrResp.Markdown,
|
||||
MathML: ocrResp.MathML,
|
||||
MML: ocrResp.MML,
|
||||
})
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processFormulaTask", "msg", "序列化公式内容失败", "error", err)
|
||||
return err
|
||||
}
|
||||
result := dao.RecognitionResult{
|
||||
TaskID: taskID,
|
||||
TaskType: dao.TaskTypeFormula,
|
||||
Content: contentJSON,
|
||||
}
|
||||
if err = result.SetMetaData(dao.ResultMetaData{TotalNum: 1}); err != nil {
|
||||
log.Error(ctx, "func", "processFormulaTask", "msg", "序列化MetaData失败", "error", err)
|
||||
return err
|
||||
}
|
||||
err = resultDao.Create(tx, result)
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processFormulaTask", "msg", "保存任务结果失败", "error", err)
|
||||
return err
|
||||
@@ -662,15 +677,25 @@ func (s *RecognitionService) processVLFormulaTask(ctx context.Context, taskID in
|
||||
return err
|
||||
}
|
||||
if result == nil {
|
||||
formulaRes := &dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Latex: latex}
|
||||
err = resultDao.Create(dao.DB.WithContext(ctx), *formulaRes)
|
||||
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{Latex: latex})
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processVLFormulaTask", "msg", "序列化公式内容失败", "error", err)
|
||||
return err
|
||||
}
|
||||
newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
|
||||
_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
|
||||
err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processVLFormulaTask", "msg", "创建任务结果失败", "error", err)
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
result.Latex = latex
|
||||
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{"latex": latex})
|
||||
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{Latex: latex})
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processVLFormulaTask", "msg", "序列化公式内容失败", "error", err)
|
||||
return err
|
||||
}
|
||||
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{"content": contentJSON})
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processVLFormulaTask", "msg", "更新任务结果失败", "error", err)
|
||||
return err
|
||||
@@ -851,23 +876,35 @@ func (s *RecognitionService) processMathpixTask(ctx context.Context, taskID int6
|
||||
|
||||
if result == nil {
|
||||
// 创建新结果
|
||||
err = resultDao.Create(dao.DB.WithContext(ctx), dao.RecognitionResult{
|
||||
TaskID: taskID,
|
||||
TaskType: dao.TaskTypeFormula,
|
||||
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
|
||||
Latex: mathpixResp.LatexStyled,
|
||||
Markdown: mathpixResp.Text,
|
||||
MathML: mathpixResp.GetMathML(),
|
||||
})
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processMathpixTask", "msg", "序列化公式内容失败", "error", err)
|
||||
return err
|
||||
}
|
||||
newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
|
||||
_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
|
||||
err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processMathpixTask", "msg", "创建任务结果失败", "error", err)
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
// 更新现有结果
|
||||
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
|
||||
Latex: mathpixResp.LatexStyled,
|
||||
Markdown: mathpixResp.Text,
|
||||
MathML: mathpixResp.GetMathML(),
|
||||
})
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processMathpixTask", "msg", "序列化公式内容失败", "error", err)
|
||||
return err
|
||||
}
|
||||
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{
|
||||
"latex": mathpixResp.LatexStyled,
|
||||
"markdown": mathpixResp.Text,
|
||||
"mathml": mathpixResp.GetMathML(),
|
||||
"content": contentJSON,
|
||||
})
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processMathpixTask", "msg", "更新任务结果失败", "error", err)
|
||||
@@ -1027,23 +1064,35 @@ func (s *RecognitionService) processBaiduOCRTask(ctx context.Context, taskID int
|
||||
|
||||
if result == nil {
|
||||
// 创建新结果
|
||||
err = resultDao.Create(dao.DB.WithContext(ctx), dao.RecognitionResult{
|
||||
TaskID: taskID,
|
||||
TaskType: dao.TaskTypeFormula,
|
||||
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
|
||||
Markdown: markdownResult,
|
||||
Latex: latex,
|
||||
MathML: mml,
|
||||
})
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "序列化公式内容失败", "error", err)
|
||||
return err
|
||||
}
|
||||
newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
|
||||
_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
|
||||
err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "创建任务结果失败", "error", err)
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
// 更新现有结果
|
||||
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
|
||||
Markdown: markdownResult,
|
||||
Latex: latex,
|
||||
MathML: mml,
|
||||
})
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "序列化公式内容失败", "error", err)
|
||||
return err
|
||||
}
|
||||
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{
|
||||
"markdown": markdownResult,
|
||||
"latex": latex,
|
||||
"mathml": mml,
|
||||
"content": contentJSON,
|
||||
})
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "更新任务结果失败", "error", err)
|
||||
|
||||
Reference in New Issue
Block a user