feat: add PDF document recognition with 10-page pre-hook

- Migrate recognition_results table to JSON schema (meta_data + content),
  replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
  rendering; limits processing to the first 10 pages (pre-hook)
- Reuse the existing downstream OCR endpoint (cloud.texpixel.com) for each
  page image; results are stored as a [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-31 14:17:44 +08:00
parent 876e64366b
commit 9d712c921a
14 changed files with 760 additions and 67 deletions

View File

@@ -1,45 +1,104 @@
package dao
import (
	"encoding/json"
	"errors"

	"gorm.io/gorm"
)
type RecognitionResult struct {
BaseModel
TaskID int64 `gorm:"column:task_id;bigint;not null;default:0;comment:任务ID" json:"task_id"`
TaskType TaskType `gorm:"column:task_type;varchar(16);not null;comment:任务类型;default:''" json:"task_type"`
Latex string `json:"latex" gorm:"column:latex;type:text;not null;default:''"`
Markdown string `json:"markdown" gorm:"column:markdown;type:text;not null;default:''"` // Markdown 格式
MathML string `json:"mathml" gorm:"column:mathml;type:text;not null;default:''"` // MathML 格式
MML string `json:"mml" gorm:"column:mml;type:text;not null;default:''"` // MML 格式
// FormulaContent is the structure stored in the content field for formula
// recognition results. Each field is serialized under the lower-case JSON key
// given in its tag (latex, markdown, mathml, mml).
type FormulaContent struct {
Latex string `json:"latex"`
Markdown string `json:"markdown"`
MathML string `json:"mathml"`
MML string `json:"mml"`
}
type RecognitionResultDao struct {
// PDFPageContent is the recognition result of a single PDF page: the page
// number and the Markdown text recognized for that page.
// NOTE(review): whether PageNumber is 0- or 1-based is not visible here — confirm with the producer.
type PDFPageContent struct {
PageNumber int `json:"page_number"`
Markdown string `json:"markdown"`
}
// ResultMetaData is the structure of the recognition_results.meta_data field.
// NOTE(review): TotalNum is presumably the number of recognized items/pages;
// nothing in this file assigns it — confirm against the writers.
type ResultMetaData struct {
TotalNum int `json:"total_num"`
}
// RecognitionResult is the model for the recognition_results table.
// MetaData and Content carry JSON text and are mapped to `type:json` columns.
// NOTE(review): the task_id and task_type tags list `bigint` / `varchar(16)`
// without a `type:` key — confirm GORM actually applies them as column types.
// NOTE(review): MetaData and Content are plain strings; presumably writers
// always populate both before insert, since an empty string is not valid
// JSON — verify against callers.
type RecognitionResult struct {
BaseModel
TaskID int64 `gorm:"column:task_id;bigint;not null;default:0;index;comment:任务ID" json:"task_id"`
TaskType TaskType `gorm:"column:task_type;varchar(16);not null;comment:任务类型;default:''" json:"task_type"`
MetaData string `gorm:"column:meta_data;type:json;comment:元数据" json:"meta_data"`
Content string `gorm:"column:content;type:json;comment:识别内容JSON" json:"content"`
}
// SetMetaData encodes meta as JSON and stores the resulting text in
// r.MetaData. When encoding fails the error is returned and r.MetaData is
// left untouched.
func (r *RecognitionResult) SetMetaData(meta ResultMetaData) error {
	encoded, err := json.Marshal(meta)
	if err == nil {
		r.MetaData = string(encoded)
	}
	return err
}
// GetFormulaContent decodes the JSON text in r.Content into a FormulaContent.
// It returns (nil, err) when r.Content is not valid JSON for that structure.
func (r *RecognitionResult) GetFormulaContent() (*FormulaContent, error) {
	formula := new(FormulaContent)
	err := json.Unmarshal([]byte(r.Content), formula)
	if err != nil {
		return nil, err
	}
	return formula, nil
}
// GetPDFContent decodes the JSON text in r.Content into the per-page PDF
// results. It returns (nil, err) when r.Content cannot be decoded as an array
// of PDFPageContent.
func (r *RecognitionResult) GetPDFContent() ([]PDFPageContent, error) {
	var decoded []PDFPageContent
	raw := []byte(r.Content)
	err := json.Unmarshal(raw, &decoded)
	if err != nil {
		return nil, err
	}
	return decoded, nil
}
// MarshalFormulaContent encodes a formula result as a JSON string, intended
// to be written into the Content field. On failure it returns an empty string
// together with the encoding error.
func MarshalFormulaContent(c FormulaContent) (string, error) {
	encoded, err := json.Marshal(c)
	if err != nil {
		return "", err
	}
	return string(encoded), nil
}
// MarshalPDFContent encodes per-page PDF results as a JSON string, intended
// to be written into the Content field.
//
// The output is always a JSON array: a nil slice is normalized to an empty
// one first, because encoding/json would otherwise encode it as `null`.
// On failure it returns an empty string together with the encoding error.
func MarshalPDFContent(pages []PDFPageContent) (string, error) {
	if pages == nil {
		pages = []PDFPageContent{}
	}
	b, err := json.Marshal(pages)
	if err != nil {
		return "", err
	}
	return string(b), nil
}
// RecognitionResultDao groups the database operations for RecognitionResult
// rows. It has no fields; every method receives the *gorm.DB to operate on.
type RecognitionResultDao struct{}
// NewRecognitionResultDao returns a pointer to a new, empty
// RecognitionResultDao.
func NewRecognitionResultDao() *RecognitionResultDao {
	return new(RecognitionResultDao)
}
// 模型方法
// Create inserts data as a new row through tx and returns the error reported
// by GORM, if any.
//
// data is received by value, so GORM operates on a local copy; anything it
// writes back into the struct during the insert is not visible to the caller.
func (dao *RecognitionResultDao) Create(tx *gorm.DB, data RecognitionResult) error {
	inserted := tx.Create(&data)
	return inserted.Error
}
func (dao *RecognitionResultDao) GetByTaskID(tx *gorm.DB, taskID int64) (result *RecognitionResult, err error) {
result = &RecognitionResult{}
err = tx.Where("task_id = ?", taskID).First(result).Error
func (dao *RecognitionResultDao) GetByTaskID(tx *gorm.DB, taskID int64) (*RecognitionResult, error) {
result := &RecognitionResult{}
err := tx.Where("task_id = ?", taskID).First(result).Error
if err != nil && err == gorm.ErrRecordNotFound {
return nil, nil
}
return
}
func (dao *RecognitionResultDao) GetByTaskIDs(tx *gorm.DB, taskIDs []int64) (results []*RecognitionResult, err error) {
err = tx.Where("task_id IN (?)", taskIDs).Find(&results).Error
return
return result, err
}
// Update applies the column/value pairs in updates to the RecognitionResult
// row whose id equals id, and returns the error reported by GORM, if any.
func (dao *RecognitionResultDao) Update(tx *gorm.DB, id int64, updates map[string]interface{}) error {
	target := tx.Model(&RecognitionResult{}).Where("id = ?", id)
	return target.Updates(updates).Error
}
// GetByTaskIDs loads every recognition_results row whose task_id is contained
// in taskIDs. It returns whatever rows GORM populated together with the query
// error, if any.
func (dao *RecognitionResultDao) GetByTaskIDs(tx *gorm.DB, taskIDs []int64) ([]*RecognitionResult, error) {
	var rows []*RecognitionResult
	query := tx.Where("task_id IN (?)", taskIDs)
	if err := query.Find(&rows).Error; err != nil {
		return rows, err
	}
	return rows, nil
}

View File

@@ -20,6 +20,7 @@ const (
TaskTypeText TaskType = "TEXT"
TaskTypeTable TaskType = "TABLE"
TaskTypeLayout TaskType = "LAYOUT"
TaskTypePDF TaskType = "PDF"
)
func (t TaskType) String() string {