feat: add PDF document recognition with 10-page pre-hook
- Migrate recognition_results table to JSON schema (meta_data + content),
replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
rendering; limit processing to the first 10 pages (pre-hook)
- Reuse the existing downstream OCR endpoint (cloud.texpixel.com) for each
page image; store results as a [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -195,12 +195,27 @@ func migrateData(testDB, prodDB *gorm.DB) error {
|
||||
mathml = *item.MathML
|
||||
}
|
||||
|
||||
newResult := dao.RecognitionResult{
|
||||
TaskID: newTask.ID, // 使用新任务的ID
|
||||
TaskType: dao.TaskType(item.TaskType),
|
||||
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
|
||||
Latex: latex,
|
||||
Markdown: markdown,
|
||||
MathML: mathml,
|
||||
})
|
||||
if err != nil {
|
||||
log.Printf("[%d/%d] 序列化公式内容失败: task_id=%d, error=%v", i+1, len(tasksWithResults), newTask.ID, err)
|
||||
tx.Rollback()
|
||||
errorCount++
|
||||
continue
|
||||
}
|
||||
newResult := dao.RecognitionResult{
|
||||
TaskID: newTask.ID, // 使用新任务的ID
|
||||
TaskType: dao.TaskType(item.TaskType),
|
||||
Content: contentJSON,
|
||||
}
|
||||
if err := newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1}); err != nil {
|
||||
log.Printf("[%d/%d] 序列化MetaData失败: task_id=%d, error=%v", i+1, len(tasksWithResults), newTask.ID, err)
|
||||
tx.Rollback()
|
||||
errorCount++
|
||||
continue
|
||||
}
|
||||
// 保留原始时间戳
|
||||
if item.ResultCreatedAt != nil {
|
||||
|
||||
Reference in New Issue
Block a user