Files
doc_ai_backed/internal/storage/dao/result.go
yoge 9d712c921a feat: add PDF document recognition with 10-page pre-hook
- Migrate recognition_results table to JSON schema (meta_data + content),
  replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
  rendering; limits processing to first 10 pages (pre-hook)
- Reuse existing downstream OCR endpoint (cloud.texpixel.com) for each
  page image; stores results as [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-31 14:17:44 +08:00

105 lines
3.1 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package dao
import (
	"encoding/json"
	"errors"

	"gorm.io/gorm"
)
// FormulaContent is the JSON structure stored in the content field for a
// formula recognition result. Every field is a plain string and is always
// emitted (no omitempty), so an empty value serializes as "".
type FormulaContent struct {
	// Latex is serialized under the "latex" key.
	Latex string `json:"latex"`
	// Markdown is serialized under the "markdown" key.
	Markdown string `json:"markdown"`
	// MathML is serialized under the "mathml" key.
	MathML string `json:"mathml"`
	// MML is serialized under the "mml" key.
	MML string `json:"mml"`
}
// PDFPageContent is the recognition result for a single PDF page.
type PDFPageContent struct {
	// PageNumber is serialized under the "page_number" key.
	PageNumber int `json:"page_number"`
	// Markdown is serialized under the "markdown" key.
	Markdown string `json:"markdown"`
}
// ResultMetaData is the JSON structure stored in the meta_data column of
// recognition_results.
type ResultMetaData struct {
	// TotalNum is serialized under the "total_num" key.
	// NOTE(review): what is being counted is not defined in this file
	// (presumably result items or pages) — confirm with the code that sets it.
	TotalNum int `json:"total_num"`
}
// RecognitionResult is the model for the recognition_results table.
//
// MetaData and Content are kept as raw JSON text; the helper methods and
// Marshal* functions in this file convert them to and from typed values.
//
// NOTE(review): the "bigint" and "varchar(16)" segments in the gorm tags
// below carry no "type:" key — confirm GORM interprets them as intended.
type RecognitionResult struct {
	BaseModel

	// TaskID is the ID of the task this result belongs to (column task_id).
	TaskID int64 `gorm:"column:task_id;bigint;not null;default:0;index;comment:任务ID" json:"task_id"`

	// TaskType is the kind of task that produced the result (column task_type).
	TaskType TaskType `gorm:"column:task_type;varchar(16);not null;comment:任务类型;default:''" json:"task_type"`

	// MetaData is JSON text for the meta_data column.
	MetaData string `gorm:"column:meta_data;type:json;comment:元数据" json:"meta_data"`

	// Content is JSON text for the content column holding the recognition output.
	Content string `gorm:"column:content;type:json;comment:识别内容JSON" json:"content"`
}
// SetMetaData encodes meta as JSON and stores the resulting text in the
// MetaData field. When encoding fails the error is returned and MetaData is
// left as it was.
func (r *RecognitionResult) SetMetaData(meta ResultMetaData) error {
	encoded, err := json.Marshal(meta)
	if err == nil {
		r.MetaData = string(encoded)
	}
	return err
}
// GetFormulaContent decodes the Content field as a FormulaContent.
// It returns (nil, err) when Content is not JSON that fits that structure,
// which includes an empty Content string.
func (r *RecognitionResult) GetFormulaContent() (*FormulaContent, error) {
	formula := new(FormulaContent)
	err := json.Unmarshal([]byte(r.Content), formula)
	if err != nil {
		return nil, err
	}
	return formula, nil
}
// GetPDFContent decodes the Content field as a list of per-page PDF results.
// It returns (nil, err) when Content is not JSON that fits a
// []PDFPageContent, which includes an empty Content string.
func (r *RecognitionResult) GetPDFContent() ([]PDFPageContent, error) {
	raw := []byte(r.Content)

	var decoded []PDFPageContent
	err := json.Unmarshal(raw, &decoded)
	if err != nil {
		return nil, err
	}
	return decoded, nil
}
// MarshalFormulaContent encodes a formula result as a JSON string suitable
// for storing in RecognitionResult.Content. The encoder's error is returned
// unchanged alongside whatever text was produced.
func MarshalFormulaContent(c FormulaContent) (string, error) {
	raw, err := json.Marshal(c)
	text := string(raw)
	return text, err
}
// MarshalPDFContent encodes per-page PDF results as a JSON array string
// suitable for storing in RecognitionResult.Content.
//
// A nil pages slice is encoded as "[]" rather than "null", so the stored
// value is always a JSON array. On failure it returns "" and the encoder's
// error.
func MarshalPDFContent(pages []PDFPageContent) (string, error) {
	if pages == nil {
		// encoding/json renders a nil slice as null; substitute an empty
		// slice so the output stays an array.
		pages = []PDFPageContent{}
	}
	b, err := json.Marshal(pages)
	if err != nil {
		return "", err
	}
	return string(b), nil
}
// RecognitionResultDao groups the database operations on RecognitionResult
// rows. It has no fields; the methods defined in this file each receive the
// *gorm.DB handle to operate on.
type RecognitionResultDao struct{}
// NewRecognitionResultDao returns a pointer to a new, empty
// RecognitionResultDao.
func NewRecognitionResultDao() *RecognitionResultDao {
	return new(RecognitionResultDao)
}
// Create inserts data as a new row using tx and returns the error reported
// by the insert, if any.
//
// data is received by value, so anything written into the struct during the
// insert lands on this method's local copy and is not visible to the caller.
func (dao *RecognitionResultDao) Create(tx *gorm.DB, data RecognitionResult) error {
	inserted := tx.Create(&data)
	return inserted.Error
}
// GetByTaskID fetches the first recognition result whose task_id equals
// taskID, using tx.
//
// Returns:
//   - (result, nil) when a row is found;
//   - (nil, nil) when no row matches, so callers must nil-check the result;
//   - (nil, err) for any other query error.
func (dao *RecognitionResultDao) GetByTaskID(tx *gorm.DB, taskID int64) (*RecognitionResult, error) {
	result := &RecognitionResult{}
	err := tx.Where("task_id = ?", taskID).First(result).Error
	switch {
	case err == nil:
		return result, nil
	case errors.Is(err, gorm.ErrRecordNotFound):
		// errors.Is also matches when the sentinel has been wrapped, which a
		// plain == comparison would miss.
		return nil, nil
	default:
		// Do not hand back a partially populated struct alongside an error.
		return nil, err
	}
}
// Update applies updates to the recognition_results row(s) whose id equals
// id, using tx, and returns the error reported by the update, if any.
// The updates map is passed to gorm's Updates unchanged.
func (dao *RecognitionResultDao) Update(tx *gorm.DB, id int64, updates map[string]interface{}) error {
	target := tx.Model(&RecognitionResult{}).Where("id = ?", id)
	return target.Updates(updates).Error
}
// GetByTaskIDs fetches every recognition result whose task_id is one of
// taskIDs, using tx.
//
// An empty or nil taskIDs returns an empty, non-nil slice without querying
// the database, since no row can match. On a query error it returns
// (nil, err).
func (dao *RecognitionResultDao) GetByTaskIDs(tx *gorm.DB, taskIDs []int64) ([]*RecognitionResult, error) {
	if len(taskIDs) == 0 {
		// Nothing to look up; skip the round trip.
		return []*RecognitionResult{}, nil
	}

	var results []*RecognitionResult
	if err := tx.Where("task_id IN (?)", taskIDs).Find(&results).Error; err != nil {
		return nil, err
	}
	return results, nil
}