feat: add PDF document recognition with 10-page pre-hook

- Migrate recognition_results table to JSON schema (meta_data + content), replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page rendering; limits processing to the first 10 pages (pre-hook)
- Reuse existing downstream OCR endpoint (cloud.texpixel.com) for each page image; stores results as a [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-31 14:17:44 +08:00
parent 876e64366b
commit 9d712c921a
14 changed files with 760 additions and 67 deletions
--- a/internal/service/recognition_service.go
+++ b/internal/service/recognition_service.go
@@ -169,18 +169,21 @@ func (s *RecognitionService) GetFormualTask(ctx context.Context, taskNo string)
 		return nil, common.NewError(common.CodeDBError, "查询任务结果失败", err)
 	}

-	// 构建 Markdown 格式
-	markdown := taskRet.Markdown
-	if markdown == "" {
-		markdown = fmt.Sprintf("$$%s$$", taskRet.Latex)
+	formulaContent, err := taskRet.GetFormulaContent()
+	if err != nil {
+		log.Error(ctx, "func", "GetFormualTask", "msg", "解析公式内容失败", "error", err)
+		return nil, common.NewError(common.CodeSystemError, "解析识别结果失败", err)
+	}
+	markdown := formulaContent.Markdown
+	if markdown == "" {
+		markdown = fmt.Sprintf("$$%s$$", formulaContent.Latex)
 	}
-
 	return &formula.GetFormulaTaskResponse{
 		TaskNo:   taskNo,
-		Latex:    taskRet.Latex,
+		Latex:    formulaContent.Latex,
 		Markdown: markdown,
-		MathML:   taskRet.MathML,
-		MML:      taskRet.MML,
+		MathML:   formulaContent.MathML,
+		MML:      formulaContent.MML,
 		Status:   int(task.Status),
 	}, nil
 }
@@ -539,14 +542,26 @@ func (s *RecognitionService) processFormulaTask(ctx context.Context, taskID int6
 		log.Error(ctx, "func", "processFormulaTask", "msg", "解析响应JSON失败", "error", err)
 		return err
 	}
-	err = resultDao.Create(tx, dao.RecognitionResult{
-		TaskID:   taskID,
-		TaskType: dao.TaskTypeFormula,
+	contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
 		Latex:    ocrResp.Latex,
 		Markdown: ocrResp.Markdown,
 		MathML:   ocrResp.MathML,
 		MML:      ocrResp.MML,
 	})
+	if err != nil {
+		log.Error(ctx, "func", "processFormulaTask", "msg", "序列化公式内容失败", "error", err)
+		return err
+	}
+	result := dao.RecognitionResult{
+		TaskID:   taskID,
+		TaskType: dao.TaskTypeFormula,
+		Content:  contentJSON,
+	}
+	if err = result.SetMetaData(dao.ResultMetaData{TotalNum: 1}); err != nil {
+		log.Error(ctx, "func", "processFormulaTask", "msg", "序列化MetaData失败", "error", err)
+		return err
+	}
+	err = resultDao.Create(tx, result)
 	if err != nil {
 		log.Error(ctx, "func", "processFormulaTask", "msg", "保存任务结果失败", "error", err)
 		return err
@@ -662,15 +677,25 @@ func (s *RecognitionService) processVLFormulaTask(ctx context.Context, taskID in
 		return err
 	}
 	if result == nil {
-		formulaRes := &dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Latex: latex}
-		err = resultDao.Create(dao.DB.WithContext(ctx), *formulaRes)
+		contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{Latex: latex})
+		if err != nil {
+			log.Error(ctx, "func", "processVLFormulaTask", "msg", "序列化公式内容失败", "error", err)
+			return err
+		}
+		newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
+		_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
+		err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
 		if err != nil {
 			log.Error(ctx, "func", "processVLFormulaTask", "msg", "创建任务结果失败", "error", err)
 			return err
 		}
 	} else {
-		result.Latex = latex
-		err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{"latex": latex})
+		contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{Latex: latex})
+		if err != nil {
+			log.Error(ctx, "func", "processVLFormulaTask", "msg", "序列化公式内容失败", "error", err)
+			return err
+		}
+		err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{"content": contentJSON})
 		if err != nil {
 			log.Error(ctx, "func", "processVLFormulaTask", "msg", "更新任务结果失败", "error", err)
 			return err
@@ -851,23 +876,35 @@ func (s *RecognitionService) processMathpixTask(ctx context.Context, taskID int6

 	if result == nil {
 		// 创建新结果
-		err = resultDao.Create(dao.DB.WithContext(ctx), dao.RecognitionResult{
-			TaskID:   taskID,
-			TaskType: dao.TaskTypeFormula,
+		contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
 			Latex:    mathpixResp.LatexStyled,
 			Markdown: mathpixResp.Text,
 			MathML:   mathpixResp.GetMathML(),
 		})
+		if err != nil {
+			log.Error(ctx, "func", "processMathpixTask", "msg", "序列化公式内容失败", "error", err)
+			return err
+		}
+		newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
+		_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
+		err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
 		if err != nil {
 			log.Error(ctx, "func", "processMathpixTask", "msg", "创建任务结果失败", "error", err)
 			return err
 		}
 	} else {
 		// 更新现有结果
+		contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
+			Latex:    mathpixResp.LatexStyled,
+			Markdown: mathpixResp.Text,
+			MathML:   mathpixResp.GetMathML(),
+		})
+		if err != nil {
+			log.Error(ctx, "func", "processMathpixTask", "msg", "序列化公式内容失败", "error", err)
+			return err
+		}
 		err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{
-			"latex":    mathpixResp.LatexStyled,
-			"markdown": mathpixResp.Text,
-			"mathml":   mathpixResp.GetMathML(),
+			"content": contentJSON,
 		})
 		if err != nil {
 			log.Error(ctx, "func", "processMathpixTask", "msg", "更新任务结果失败", "error", err)
@@ -1027,23 +1064,35 @@ func (s *RecognitionService) processBaiduOCRTask(ctx context.Context, taskID int

 	if result == nil {
 		// 创建新结果
-		err = resultDao.Create(dao.DB.WithContext(ctx), dao.RecognitionResult{
-			TaskID:   taskID,
-			TaskType: dao.TaskTypeFormula,
+		contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
 			Markdown: markdownResult,
 			Latex:    latex,
 			MathML:   mml,
 		})
+		if err != nil {
+			log.Error(ctx, "func", "processBaiduOCRTask", "msg", "序列化公式内容失败", "error", err)
+			return err
+		}
+		newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
+		_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
+		err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
 		if err != nil {
 			log.Error(ctx, "func", "processBaiduOCRTask", "msg", "创建任务结果失败", "error", err)
 			return err
 		}
 	} else {
 		// 更新现有结果
+		contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
+			Markdown: markdownResult,
+			Latex:    latex,
+			MathML:   mml,
+		})
+		if err != nil {
+			log.Error(ctx, "func", "processBaiduOCRTask", "msg", "序列化公式内容失败", "error", err)
+			return err
+		}
 		err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{
-			"markdown": markdownResult,
-			"latex":    latex,
-			"mathml":   mml,
+			"content": contentJSON,
 		})
 		if err != nil {
 			log.Error(ctx, "func", "processBaiduOCRTask", "msg", "更新任务结果失败", "error", err)