feat: add PDF document recognition with 10-page pre-hook

- Migrate recognition_results table to JSON schema (meta_data + content), replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page rendering; limits processing to the first 10 pages (pre-hook)
- Reuse existing downstream OCR endpoint (cloud.texpixel.com) for each page image; stores results as [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-31 14:17:44 +08:00
parent 876e64366b
commit 9d712c921a
14 changed files with 760 additions and 67 deletions
--- a/cmd/migrate/main.go
+++ b/cmd/migrate/main.go
@@ -195,12 +195,27 @@ func migrateData(testDB, prodDB *gorm.DB) error {
 				mathml = *item.MathML
 			}

-			newResult := dao.RecognitionResult{
-				TaskID:   newTask.ID, // 使用新任务的ID
-				TaskType: dao.TaskType(item.TaskType),
+			contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
 				Latex:    latex,
 				Markdown: markdown,
 				MathML:   mathml,
+			})
+			if err != nil {
+				log.Printf("[%d/%d] 序列化公式内容失败: task_id=%d, error=%v", i+1, len(tasksWithResults), newTask.ID, err)
+				tx.Rollback()
+				errorCount++
+				continue
+			}
+			newResult := dao.RecognitionResult{
+				TaskID:   newTask.ID, // 使用新任务的ID
+				TaskType: dao.TaskType(item.TaskType),
+				Content:  contentJSON,
+			}
+			if err := newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1}); err != nil {
+				log.Printf("[%d/%d] 序列化MetaData失败: task_id=%d, error=%v", i+1, len(tasksWithResults), newTask.ID, err)
+				tx.Rollback()
+				errorCount++
+				continue
 			}
 			// 保留原始时间戳
 			if item.ResultCreatedAt != nil {