feat: add PDF document recognition with 10-page pre-hook

- Migrate recognition_results table to JSON schema (meta_data + content),
  replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
  rendering; limits processing to first 10 pages (pre-hook)
- Reuse existing downstream OCR endpoint (cloud.texpixel.com) for each
  page image; stores results as [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-31 14:17:44 +08:00
parent 876e64366b
commit 9d712c921a
14 changed files with 760 additions and 67 deletions

View File

@@ -0,0 +1,34 @@
package pdf
// CreatePDFRecognitionRequest is the request body for creating a PDF
// recognition task. FileURL, FileHash and FileName are tagged
// `binding:"required"`; UserID carries no binding tag.
type CreatePDFRecognitionRequest struct {
	FileURL  string `json:"file_url" binding:"required"`
	FileHash string `json:"file_hash" binding:"required"`
	FileName string `json:"file_name" binding:"required"`
	UserID   int64  `json:"user_id"`
}
// GetPDFTaskRequest holds the URI parameters of the task lookup endpoint.
// TaskNo is bound from the `task_no` URI segment and is required.
type GetPDFTaskRequest struct {
	TaskNo string `uri:"task_no" binding:"required"`
}
// CreatePDFTaskResponse is returned after a PDF recognition task has been
// created: the task number plus its numeric status.
type CreatePDFTaskResponse struct {
	TaskNo string `json:"task_no"`
	Status int    `json:"status"`
}
// PDFPageResult is the recognition output for a single page: its page number
// and the Markdown text recognized on it.
type PDFPageResult struct {
	PageNumber int    `json:"page_number"`
	Markdown   string `json:"markdown"`
}
// GetPDFTaskResponse reports the status of a PDF recognition task and, when
// results are available, the per-page output together with the page count.
type GetPDFTaskResponse struct {
	TaskNo     string          `json:"task_no"`
	Status     int             `json:"status"`
	TotalPages int             `json:"total_pages"`
	Pages      []PDFPageResult `json:"pages"`
}

View File

@@ -0,0 +1,343 @@
package service
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"sort"
"time"
pdfmodel "gitea.com/texpixel/document_ai/internal/model/pdf"
"gitea.com/texpixel/document_ai/internal/storage/cache"
"gitea.com/texpixel/document_ai/internal/storage/dao"
"gitea.com/texpixel/document_ai/pkg/common"
"gitea.com/texpixel/document_ai/pkg/httpclient"
"gitea.com/texpixel/document_ai/pkg/log"
"gitea.com/texpixel/document_ai/pkg/oss"
"gitea.com/texpixel/document_ai/pkg/requestid"
"gitea.com/texpixel/document_ai/pkg/utils"
"gorm.io/gorm"
"gitea.com/texpixel/document_ai/internal/model/formula"
)
const (
	// pdfMaxPages is the page cap passed to the renderer by processPDFTask;
	// pages beyond it are not processed.
	pdfMaxPages = 10

	// pdfOCREndpoint is the downstream OCR URL that each rendered page image
	// is posted to.
	pdfOCREndpoint = "https://cloud.texpixel.com:10443/doc_process/v1/image/ocr"
)
// PDFRecognitionService creates, processes and queries PDF recognition tasks.
type PDFRecognitionService struct {
	// db is initialised from dao.DB by the constructor.
	db *gorm.DB
	// queueLimit is a buffered channel; a slot is taken before each queue poll
	// and released once that task has been handled.
	queueLimit chan struct{}
	// stopChan is closed by Stop to end the queue consumer loop.
	stopChan chan struct{}
	// httpClient issues the requests to the downstream OCR endpoint.
	httpClient *httpclient.Client
}
// NewPDFRecognitionService builds a PDF recognition service and hands its
// queue consumer to utils.SafeGo (presumably a panic-safe goroutine launcher —
// confirm against its definition).
//
// The consumer first tries to obtain the PDF distributed lock. It only starts
// draining the queue when the lock is obtained; on a cache error, or when the
// lock is not granted, it logs and exits, and this instance then has no
// worker.
func NewPDFRecognitionService() *PDFRecognitionService {
	s := &PDFRecognitionService{
		db:         dao.DB,
		queueLimit: make(chan struct{}, 3),
		stopChan:   make(chan struct{}),
		httpClient: httpclient.NewClient(nil),
	}

	utils.SafeGo(func() {
		// The worker is not bound to any request, so it uses a background context.
		ctx := context.Background()

		lock, err := cache.GetPDFDistributedLock(ctx)
		if err != nil {
			// Log the underlying error so a cache failure can be told apart
			// from the lock simply not being granted.
			log.Error(ctx, "func", "NewPDFRecognitionService", "msg", "获取PDF分布式锁失败", "error", err)
			return
		}
		if !lock {
			log.Error(ctx, "func", "NewPDFRecognitionService", "msg", "获取PDF分布式锁失败")
			return
		}
		s.processPDFQueue(ctx)
	})
	return s
}
// CreatePDFTask stores a new pending PDF recognition task and pushes its ID
// onto the PDF task queue.
//
// It returns the stored task, or a common error with CodeDBError when the
// insert fails and CodeSystemError when the queue push fails. In the latter
// case the task row has already been written and is left as created.
func (s *PDFRecognitionService) CreatePDFTask(ctx context.Context, req *pdfmodel.CreatePDFRecognitionRequest) (*dao.RecognitionTask, error) {
	task := new(dao.RecognitionTask)
	task.UserID = req.UserID
	task.TaskUUID = utils.NewUUID()
	task.TaskType = dao.TaskTypePDF
	task.Status = dao.TaskStatusPending
	task.FileURL = req.FileURL
	task.FileName = req.FileName
	task.FileHash = req.FileHash
	task.IP = common.GetIPFromContext(ctx)

	createErr := dao.NewRecognitionTaskDao().Create(dao.DB.WithContext(ctx), task)
	if createErr != nil {
		log.Error(ctx, "func", "CreatePDFTask", "msg", "创建任务失败", "error", createErr)
		return nil, common.NewError(common.CodeDBError, "创建任务失败", createErr)
	}

	_, pushErr := cache.PushPDFTask(ctx, task.ID)
	if pushErr != nil {
		log.Error(ctx, "func", "CreatePDFTask", "msg", "推入队列失败", "error", pushErr)
		return nil, common.NewError(common.CodeSystemError, "推入队列失败", pushErr)
	}

	return task, nil
}
// GetPDFTask returns the status of the task identified by taskNo and, once the
// task is completed, its per-page recognition results.
//
// Errors are common errors with these codes:
//   - CodeNotFound: no such task, or the task is not a PDF task;
//   - CodeDBError: the task or result lookup failed, or a completed task has
//     no stored result;
//   - CodeSystemError: the stored content could not be decoded.
//
// For tasks that are not completed, only TaskNo and Status are populated.
func (s *PDFRecognitionService) GetPDFTask(ctx context.Context, taskNo string) (*pdfmodel.GetPDFTaskResponse, error) {
	sess := dao.DB.WithContext(ctx)

	task, err := dao.NewRecognitionTaskDao().GetByTaskNo(sess, taskNo)
	if err != nil {
		if err == gorm.ErrRecordNotFound {
			return nil, common.NewError(common.CodeNotFound, "任务不存在", err)
		}
		return nil, common.NewError(common.CodeDBError, "查询任务失败", err)
	}

	// Type check: a formula task must not be parsed as a PDF task, so it is
	// reported as not found.
	if task.TaskType != dao.TaskTypePDF {
		return nil, common.NewError(common.CodeNotFound, "任务不存在", nil)
	}

	resp := &pdfmodel.GetPDFTaskResponse{
		TaskNo: taskNo,
		Status: int(task.Status),
	}
	if task.Status != dao.TaskStatusCompleted {
		return resp, nil
	}

	result, err := dao.NewRecognitionResultDao().GetByTaskID(sess, task.ID)
	if err != nil || result == nil {
		return nil, common.NewError(common.CodeDBError, "查询识别结果失败", err)
	}

	pages, err := result.GetPDFContent()
	if err != nil {
		return nil, common.NewError(common.CodeSystemError, "解析识别结果失败", err)
	}

	resp.TotalPages = len(pages)
	// Size the slice once; this also makes a completed task with no pages
	// encode as an empty JSON array rather than null.
	resp.Pages = make([]pdfmodel.PDFPageResult, 0, len(pages))
	for _, p := range pages {
		resp.Pages = append(resp.Pages, pdfmodel.PDFPageResult{
			PageNumber: p.PageNumber,
			Markdown:   p.Markdown,
		})
	}
	return resp, nil
}
// processPDFQueue keeps handling queued PDF tasks, one per iteration, until
// stopChan is closed. The stop signal is only checked between iterations, so a
// call that is still running inside processOnePDFTask is not interrupted.
func (s *PDFRecognitionService) processPDFQueue(ctx context.Context) {
	for {
		// Non-blocking check for a stop request before taking the next task.
		select {
		case <-s.stopChan:
			return
		default:
		}
		s.processOnePDFTask(ctx)
	}
}
// processOnePDFTask pops one task ID from the PDF queue, loads the task and
// runs processPDFTask for it with the task UUID installed as the request ID.
//
// Failures are logged rather than returned. When the queue pop itself fails
// the function waits briefly (or until Stop is requested) before returning, so
// that a persistent cache failure does not make the caller's loop spin.
func (s *PDFRecognitionService) processOnePDFTask(ctx context.Context) {
	// popRetryDelay is how long to wait after a failed queue pop.
	const popRetryDelay = time.Second

	// Take a slot for the duration of this task and give it back on return.
	s.queueLimit <- struct{}{}
	defer func() { <-s.queueLimit }()

	taskID, err := cache.PopPDFTask(ctx)
	if err != nil {
		log.Error(ctx, "func", "processOnePDFTask", "msg", "获取任务失败", "error", err)
		select {
		case <-s.stopChan:
		case <-time.After(popRetryDelay):
		}
		return
	}

	task, err := dao.NewRecognitionTaskDao().GetTaskByID(dao.DB.WithContext(ctx), taskID)
	if err != nil || task == nil {
		// err is nil when the lookup succeeded but returned no task.
		log.Error(ctx, "func", "processOnePDFTask", "msg", "任务不存在", "task_id", taskID, "error", err)
		return
	}

	// Tag the context and the request-id scope with the task UUID so that
	// everything done for this task carries the same identifier.
	ctx = context.WithValue(ctx, utils.RequestIDKey, task.TaskUUID)
	requestid.SetRequestID(task.TaskUUID, func() {
		if err := s.processPDFTask(ctx, taskID, task.FileURL); err != nil {
			log.Error(ctx, "func", "processOnePDFTask", "msg", "处理PDF任务失败", "error", err)
		}
	})
}
// processPDFTask runs the full pipeline for one task: mark it as processing,
// download the PDF, render up to pdfMaxPages pages to PNG, OCR each page in
// order, and store all pages as a single result row.
//
// The whole run is bounded by a 10-minute timeout. On return the task is
// always moved to a final state: completed when every step succeeded, failed
// (with a remark) otherwise. A failure on any page fails the whole task; no
// partial result is stored.
func (s *PDFRecognitionService) processPDFTask(ctx context.Context, taskID int64, fileURL string) error {
	ctx, cancel := context.WithTimeout(ctx, 10*time.Minute)
	defer cancel()

	taskDao := dao.NewRecognitionTaskDao()
	resultDao := dao.NewRecognitionResultDao()

	isSuccess := false
	defer func() {
		status, remark := dao.TaskStatusFailed, "任务处理失败"
		if isSuccess {
			status, remark = dao.TaskStatusCompleted, ""
		}
		// Use a fresh background context: ctx may already be expired or
		// cancelled, and the final status still has to be written.
		if err := taskDao.Update(dao.DB.WithContext(context.Background()),
			map[string]interface{}{"id": taskID},
			map[string]interface{}{"status": status, "completed_at": time.Now(), "remark": remark},
		); err != nil {
			// Do not drop this silently: the task would otherwise be left in
			// the processing state with no trace of why.
			log.Error(ctx, "func", "processPDFTask", "msg", "更新任务最终状态失败",
				"task_id", taskID, "error", err)
		}
	}()

	// Mark the task as processing.
	if err := taskDao.Update(dao.DB.WithContext(ctx),
		map[string]interface{}{"id": taskID},
		map[string]interface{}{"status": dao.TaskStatusProcessing},
	); err != nil {
		return fmt.Errorf("更新任务状态失败: %w", err)
	}

	// Download the PDF.
	reader, err := oss.DownloadFile(ctx, fileURL)
	if err != nil {
		return fmt.Errorf("下载PDF失败: %w", err)
	}
	defer reader.Close()

	pdfBytes, err := io.ReadAll(reader)
	if err != nil {
		return fmt.Errorf("读取PDF数据失败: %w", err)
	}

	// Pre-hook: render the first pdfMaxPages pages to PNG with pdftoppm.
	pageImages, err := renderPDFPages(ctx, pdfBytes, pdfMaxPages)
	if err != nil {
		return fmt.Errorf("渲染PDF页面失败: %w", err)
	}

	processPages := len(pageImages)
	log.Info(ctx, "func", "processPDFTask", "msg", "开始处理PDF",
		"task_id", taskID, "process_pages", processPages)

	// OCR the pages one at a time and collect the results in page order.
	pages := make([]dao.PDFPageContent, 0, processPages)
	for i, imgBytes := range pageImages {
		ocrResult, err := s.callOCR(ctx, imgBytes)
		if err != nil {
			return fmt.Errorf("OCR第%d页失败: %w", i+1, err)
		}
		pages = append(pages, dao.PDFPageContent{
			PageNumber: i + 1,
			Markdown:   ocrResult.Markdown,
		})
		log.Info(ctx, "func", "processPDFTask", "msg", "页面OCR完成",
			"page", i+1, "total", processPages)
	}

	// Serialize all pages and store them as one result row.
	contentJSON, err := dao.MarshalPDFContent(pages)
	if err != nil {
		return fmt.Errorf("序列化PDF内容失败: %w", err)
	}

	dbResult := dao.RecognitionResult{
		TaskID:   taskID,
		TaskType: dao.TaskTypePDF,
		Content:  contentJSON,
	}
	if err := dbResult.SetMetaData(dao.ResultMetaData{TotalNum: processPages}); err != nil {
		return fmt.Errorf("序列化MetaData失败: %w", err)
	}

	if err := resultDao.Create(dao.DB.WithContext(ctx), dbResult); err != nil {
		return fmt.Errorf("保存PDF结果失败: %w", err)
	}

	isSuccess = true
	return nil
}
// renderPDFPages renders the leading pages of a PDF to PNG images using the
// external pdftoppm tool (150 DPI) and returns the image bytes in page order.
//
// pdfBytes is written to a temporary directory that is removed before the
// function returns. maxPages must be positive and caps the number of pages
// rendered and returned. ctx bounds the pdftoppm process.
//
// An error is returned when maxPages is not positive, when the temporary files
// cannot be written or read, when pdftoppm fails (its combined output is
// included in the error), or when it produces no page images.
func renderPDFPages(ctx context.Context, pdfBytes []byte, maxPages int) ([][]byte, error) {
	// A non-positive value is not a meaningful cap for "-l"; pdftoppm appears
	// to fall back to the whole document in that case (TODO confirm), which
	// would silently defeat the page limit.
	if maxPages <= 0 {
		return nil, fmt.Errorf("无效的最大页数: %d", maxPages)
	}

	tmpDir, err := os.MkdirTemp("", "pdf-ocr-*")
	if err != nil {
		return nil, fmt.Errorf("创建临时目录失败: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	pdfPath := filepath.Join(tmpDir, "input.pdf")
	if err := os.WriteFile(pdfPath, pdfBytes, 0600); err != nil {
		return nil, fmt.Errorf("写入临时PDF失败: %w", err)
	}

	outPrefix := filepath.Join(tmpDir, "page")
	cmd := exec.CommandContext(ctx, "pdftoppm",
		"-r", "150",
		"-png",
		"-l", fmt.Sprintf("%d", maxPages),
		pdfPath,
		outPrefix,
	)
	if out, err := cmd.CombinedOutput(); err != nil {
		return nil, fmt.Errorf("pdftoppm失败: %w, output: %s", err, string(out))
	}

	files, err := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
	if err != nil {
		return nil, fmt.Errorf("查找渲染输出文件失败: %w", err)
	}
	if len(files) == 0 {
		return nil, fmt.Errorf("pdftoppm未输出任何页面")
	}

	// Lexical order equals page order as long as pdftoppm pads the page
	// numbers in the file names to a common width — NOTE(review): confirm for
	// the Poppler version in use.
	sort.Strings(files)

	// Defensive cap in case more files than requested are present.
	if len(files) > maxPages {
		files = files[:maxPages]
	}

	pages := make([][]byte, 0, len(files))
	for _, f := range files {
		data, err := os.ReadFile(f)
		if err != nil {
			return nil, fmt.Errorf("读取页面图片失败: %w", err)
		}
		pages = append(pages, data)
	}
	return pages, nil
}
// callOCR sends one page image to the downstream OCR endpoint (the same one
// used for formula recognition) and returns the decoded response.
//
// The image is sent base64-encoded in a JSON body, and the request ID found in
// ctx is forwarded as a header. Any status other than 200 is treated as a
// failure, so that an error payload is never stored as a recognition result.
func (s *PDFRecognitionService) callOCR(ctx context.Context, imgBytes []byte) (*formula.ImageOCRResponse, error) {
	// maxErrBody bounds how much of a failed response is copied into the
	// error, so a large or endless error body cannot exhaust memory.
	const maxErrBody = 4 << 10

	reqBody := map[string]string{
		"image_base64": base64.StdEncoding.EncodeToString(imgBytes),
	}
	jsonData, err := json.Marshal(reqBody)
	if err != nil {
		return nil, err
	}

	headers := map[string]string{
		"Content-Type":           "application/json",
		utils.RequestIDHeaderKey: utils.GetRequestIDFromContext(ctx),
	}

	resp, err := s.httpClient.RequestWithRetry(ctx, http.MethodPost, pdfOCREndpoint, bytes.NewReader(jsonData), headers)
	if err != nil {
		return nil, fmt.Errorf("请求OCR接口失败: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message, so
		// a read failure here is deliberately ignored.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrBody))
		return nil, fmt.Errorf("OCR接口返回非200状态: %d, body: %s", resp.StatusCode, string(body))
	}

	var ocrResp formula.ImageOCRResponse
	if err := json.NewDecoder(resp.Body).Decode(&ocrResp); err != nil {
		return nil, fmt.Errorf("解析OCR响应失败: %w", err)
	}
	return &ocrResp, nil
}
// Stop asks the queue consumer to exit by closing stopChan. The consumer
// notices the signal the next time it checks the channel between tasks.
//
// Calling Stop again after it has returned is a no-op instead of a panic on a
// closed channel. The check and the close are not atomic, so Stop must not be
// called from several goroutines at the same time.
func (s *PDFRecognitionService) Stop() {
	select {
	case <-s.stopChan:
		// Already closed by an earlier call.
	default:
		close(s.stopChan)
	}
}

View File

@@ -169,18 +169,21 @@ func (s *RecognitionService) GetFormualTask(ctx context.Context, taskNo string)
return nil, common.NewError(common.CodeDBError, "查询任务结果失败", err)
}
// 构建 Markdown 格式
markdown := taskRet.Markdown
if markdown == "" {
markdown = fmt.Sprintf("$$%s$$", taskRet.Latex)
formulaContent, err := taskRet.GetFormulaContent()
if err != nil {
log.Error(ctx, "func", "GetFormualTask", "msg", "解析公式内容失败", "error", err)
return nil, common.NewError(common.CodeSystemError, "解析识别结果失败", err)
}
markdown := formulaContent.Markdown
if markdown == "" {
markdown = fmt.Sprintf("$$%s$$", formulaContent.Latex)
}
return &formula.GetFormulaTaskResponse{
TaskNo: taskNo,
Latex: taskRet.Latex,
Latex: formulaContent.Latex,
Markdown: markdown,
MathML: taskRet.MathML,
MML: taskRet.MML,
MathML: formulaContent.MathML,
MML: formulaContent.MML,
Status: int(task.Status),
}, nil
}
@@ -539,14 +542,26 @@ func (s *RecognitionService) processFormulaTask(ctx context.Context, taskID int6
log.Error(ctx, "func", "processFormulaTask", "msg", "解析响应JSON失败", "error", err)
return err
}
err = resultDao.Create(tx, dao.RecognitionResult{
TaskID: taskID,
TaskType: dao.TaskTypeFormula,
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
Latex: ocrResp.Latex,
Markdown: ocrResp.Markdown,
MathML: ocrResp.MathML,
MML: ocrResp.MML,
})
if err != nil {
log.Error(ctx, "func", "processFormulaTask", "msg", "序列化公式内容失败", "error", err)
return err
}
result := dao.RecognitionResult{
TaskID: taskID,
TaskType: dao.TaskTypeFormula,
Content: contentJSON,
}
if err = result.SetMetaData(dao.ResultMetaData{TotalNum: 1}); err != nil {
log.Error(ctx, "func", "processFormulaTask", "msg", "序列化MetaData失败", "error", err)
return err
}
err = resultDao.Create(tx, result)
if err != nil {
log.Error(ctx, "func", "processFormulaTask", "msg", "保存任务结果失败", "error", err)
return err
@@ -662,15 +677,25 @@ func (s *RecognitionService) processVLFormulaTask(ctx context.Context, taskID in
return err
}
if result == nil {
formulaRes := &dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Latex: latex}
err = resultDao.Create(dao.DB.WithContext(ctx), *formulaRes)
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{Latex: latex})
if err != nil {
log.Error(ctx, "func", "processVLFormulaTask", "msg", "序列化公式内容失败", "error", err)
return err
}
newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
if err != nil {
log.Error(ctx, "func", "processVLFormulaTask", "msg", "创建任务结果失败", "error", err)
return err
}
} else {
result.Latex = latex
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{"latex": latex})
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{Latex: latex})
if err != nil {
log.Error(ctx, "func", "processVLFormulaTask", "msg", "序列化公式内容失败", "error", err)
return err
}
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{"content": contentJSON})
if err != nil {
log.Error(ctx, "func", "processVLFormulaTask", "msg", "更新任务结果失败", "error", err)
return err
@@ -851,23 +876,35 @@ func (s *RecognitionService) processMathpixTask(ctx context.Context, taskID int6
if result == nil {
// 创建新结果
err = resultDao.Create(dao.DB.WithContext(ctx), dao.RecognitionResult{
TaskID: taskID,
TaskType: dao.TaskTypeFormula,
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
Latex: mathpixResp.LatexStyled,
Markdown: mathpixResp.Text,
MathML: mathpixResp.GetMathML(),
})
if err != nil {
log.Error(ctx, "func", "processMathpixTask", "msg", "序列化公式内容失败", "error", err)
return err
}
newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
if err != nil {
log.Error(ctx, "func", "processMathpixTask", "msg", "创建任务结果失败", "error", err)
return err
}
} else {
// 更新现有结果
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
Latex: mathpixResp.LatexStyled,
Markdown: mathpixResp.Text,
MathML: mathpixResp.GetMathML(),
})
if err != nil {
log.Error(ctx, "func", "processMathpixTask", "msg", "序列化公式内容失败", "error", err)
return err
}
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{
"latex": mathpixResp.LatexStyled,
"markdown": mathpixResp.Text,
"mathml": mathpixResp.GetMathML(),
"content": contentJSON,
})
if err != nil {
log.Error(ctx, "func", "processMathpixTask", "msg", "更新任务结果失败", "error", err)
@@ -1027,23 +1064,35 @@ func (s *RecognitionService) processBaiduOCRTask(ctx context.Context, taskID int
if result == nil {
// 创建新结果
err = resultDao.Create(dao.DB.WithContext(ctx), dao.RecognitionResult{
TaskID: taskID,
TaskType: dao.TaskTypeFormula,
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
Markdown: markdownResult,
Latex: latex,
MathML: mml,
})
if err != nil {
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "序列化公式内容失败", "error", err)
return err
}
newResult := dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON}
_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1})
err = resultDao.Create(dao.DB.WithContext(ctx), newResult)
if err != nil {
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "创建任务结果失败", "error", err)
return err
}
} else {
// 更新现有结果
contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{
Markdown: markdownResult,
Latex: latex,
MathML: mml,
})
if err != nil {
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "序列化公式内容失败", "error", err)
return err
}
err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{
"markdown": markdownResult,
"latex": latex,
"mathml": mml,
"content": contentJSON,
})
if err != nil {
log.Error(ctx, "func", "processBaiduOCRTask", "msg", "更新任务结果失败", "error", err)

View File

@@ -89,17 +89,17 @@ func (svc *TaskService) GetTaskList(ctx context.Context, req *task.TaskListReque
Total: total,
}
for _, item := range tasks {
var latex string
var markdown string
var mathML string
var mml string
var latex, markdown, mathML, mml string
recognitionResult := recognitionResultMap[item.ID]
if recognitionResult != nil {
latex = recognitionResult.Latex
markdown = recognitionResult.Markdown
mathML = recognitionResult.MathML
mml = recognitionResult.MML
if recognitionResult != nil && recognitionResult.TaskType == dao.TaskTypeFormula {
if fc, err := recognitionResult.GetFormulaContent(); err == nil {
latex = fc.Latex
markdown = fc.Markdown
mathML = fc.MathML
mml = fc.MML
}
}
// PDF 类型的 TaskListDTO 暂不展开 content列表页只显示状态
originURL, err := oss.GetDownloadURL(ctx, item.FileURL)
if err != nil {
log.Error(ctx, "func", "GetTaskList", "msg", "get origin url failed", "error", err)
@@ -148,10 +148,18 @@ func (svc *TaskService) ExportTask(ctx context.Context, req *task.ExportTaskRequ
return nil, "", errors.New("recognition result not found")
}
markdown := recognitionResult.Markdown
if markdown == "" {
log.Error(ctx, "func", "ExportTask", "msg", "markdown not found")
return nil, "", errors.New("markdown not found")
var markdown string
switch recognitionResult.TaskType {
case dao.TaskTypeFormula:
fc, err := recognitionResult.GetFormulaContent()
if err != nil || fc.Markdown == "" {
log.Error(ctx, "func", "ExportTask", "msg", "公式结果解析失败或markdown为空", "error", err)
return nil, "", errors.New("markdown not found")
}
markdown = fc.Markdown
default:
log.Error(ctx, "func", "ExportTask", "msg", "不支持的导出任务类型", "task_type", recognitionResult.TaskType)
return nil, "", errors.New("unsupported task type for export")
}
// 获取文件名(去掉扩展名)

27
internal/storage/cache/pdf.go vendored Normal file
View File

@@ -0,0 +1,27 @@
package cache
import (
"context"
"strconv"
)
const (
	// PDFRecognitionTaskQueue is the Redis list key holding queued PDF task IDs.
	PDFRecognitionTaskQueue = "pdf_recognition_queue"

	// PDFRecognitionDistLock is the Redis key used as the PDF worker lock.
	PDFRecognitionDistLock = "pdf_recognition_dist_lock"
)
// PushPDFTask pushes taskID onto the PDF task queue with LPUSH and returns the
// command's integer reply together with any error.
func PushPDFTask(ctx context.Context, taskID int64) (int64, error) {
	push := RedisClient.LPush(ctx, PDFRecognitionTaskQueue, taskID)
	return push.Result()
}
// PopPDFTask takes the next task ID from the PDF task queue using BRPOP with a
// timeout of 0 and parses it as a base-10 int64. It returns 0 and the error
// when the pop fails, or the parse error when the popped value is not a valid
// integer.
func PopPDFTask(ctx context.Context) (int64, error) {
	reply, err := RedisClient.BRPop(ctx, 0, PDFRecognitionTaskQueue).Result()
	if err == nil {
		// The popped value is read from the second element of the reply.
		return strconv.ParseInt(reply[1], 10, 64)
	}
	return 0, err
}
// GetPDFDistributedLock tries to take the PDF worker lock by issuing SETNX on
// PDFRecognitionDistLock with value "locked" and expiry DefaultLockTimeout. It
// returns the command's boolean result and any error.
func GetPDFDistributedLock(ctx context.Context) (bool, error) {
	attempt := RedisClient.SetNX(ctx, PDFRecognitionDistLock, "locked", DefaultLockTimeout)
	return attempt.Result()
}

View File

@@ -1,45 +1,104 @@
package dao
import (
"encoding/json"
"gorm.io/gorm"
)
type RecognitionResult struct {
BaseModel
TaskID int64 `gorm:"column:task_id;bigint;not null;default:0;comment:任务ID" json:"task_id"`
TaskType TaskType `gorm:"column:task_type;varchar(16);not null;comment:任务类型;default:''" json:"task_type"`
Latex string `json:"latex" gorm:"column:latex;type:text;not null;default:''"`
Markdown string `json:"markdown" gorm:"column:markdown;type:text;not null;default:''"` // Markdown 格式
MathML string `json:"mathml" gorm:"column:mathml;type:text;not null;default:''"` // MathML 格式
MML string `json:"mml" gorm:"column:mml;type:text;not null;default:''"` // MML 格式
// FormulaContent 公式识别的 content 字段结构
type FormulaContent struct {
Latex string `json:"latex"`
Markdown string `json:"markdown"`
MathML string `json:"mathml"`
MML string `json:"mml"`
}
type RecognitionResultDao struct {
// PDFPageContent is the stored recognition result of a single PDF page: the
// page number and the Markdown recognized on that page.
type PDFPageContent struct {
	PageNumber int    `json:"page_number"`
	Markdown   string `json:"markdown"`
}
// ResultMetaData is the structure serialized into the meta_data column of
// recognition_results.
type ResultMetaData struct {
	TotalNum int `json:"total_num"`
}
// RecognitionResult is the model for the recognition_results table. MetaData
// and Content are JSON columns held as strings; use SetMetaData and the
// Get*Content / Marshal*Content helpers to encode and decode them.
type RecognitionResult struct {
	BaseModel
	TaskID   int64    `gorm:"column:task_id;bigint;not null;default:0;index;comment:任务ID" json:"task_id"`
	TaskType TaskType `gorm:"column:task_type;varchar(16);not null;comment:任务类型;default:''" json:"task_type"`
	MetaData string   `gorm:"column:meta_data;type:json;comment:元数据" json:"meta_data"`
	Content  string   `gorm:"column:content;type:json;comment:识别内容JSON" json:"content"`
}
// SetMetaData encodes meta as JSON and stores it in the MetaData field. On an
// encoding error the field is left untouched and the error is returned.
func (r *RecognitionResult) SetMetaData(meta ResultMetaData) error {
	encoded, err := json.Marshal(meta)
	if err == nil {
		r.MetaData = string(encoded)
	}
	return err
}
// GetFormulaContent decodes the Content field as a formula result. It returns
// nil and the decoding error when Content is not valid JSON for that shape.
func (r *RecognitionResult) GetFormulaContent() (*FormulaContent, error) {
	content := new(FormulaContent)
	if err := json.Unmarshal([]byte(r.Content), content); err != nil {
		return nil, err
	}
	return content, nil
}
// GetPDFContent decodes the Content field as the list of per-page PDF results.
// It returns nil and the decoding error when Content cannot be decoded.
func (r *RecognitionResult) GetPDFContent() ([]PDFPageContent, error) {
	var decoded []PDFPageContent
	err := json.Unmarshal([]byte(r.Content), &decoded)
	if err == nil {
		return decoded, nil
	}
	return nil, err
}
// MarshalFormulaContent encodes a formula result as a JSON string suitable for
// the Content field. On failure it returns the empty string and the error.
func MarshalFormulaContent(c FormulaContent) (string, error) {
	raw, err := json.Marshal(c)
	if err != nil {
		return "", err
	}
	return string(raw), nil
}
// MarshalPDFContent encodes the per-page PDF results as a JSON string suitable
// for the Content field. On failure it returns the empty string and the error.
func MarshalPDFContent(pages []PDFPageContent) (string, error) {
	raw, err := json.Marshal(pages)
	if err != nil {
		return "", err
	}
	return string(raw), nil
}
// RecognitionResultDao groups the database operations on recognition_results.
// It has no fields; the *gorm.DB to use is passed to each method.
type RecognitionResultDao struct{}
// NewRecognitionResultDao returns a new RecognitionResultDao.
func NewRecognitionResultDao() *RecognitionResultDao {
	return new(RecognitionResultDao)
}
// 模型方法
// Create inserts data as a new row using tx and returns the insert error. data
// is received by value, so fields populated during the insert are not visible
// to the caller.
func (dao *RecognitionResultDao) Create(tx *gorm.DB, data RecognitionResult) error {
	inserted := tx.Create(&data)
	return inserted.Error
}
func (dao *RecognitionResultDao) GetByTaskID(tx *gorm.DB, taskID int64) (result *RecognitionResult, err error) {
result = &RecognitionResult{}
err = tx.Where("task_id = ?", taskID).First(result).Error
func (dao *RecognitionResultDao) GetByTaskID(tx *gorm.DB, taskID int64) (*RecognitionResult, error) {
result := &RecognitionResult{}
err := tx.Where("task_id = ?", taskID).First(result).Error
if err != nil && err == gorm.ErrRecordNotFound {
return nil, nil
}
return
}
func (dao *RecognitionResultDao) GetByTaskIDs(tx *gorm.DB, taskIDs []int64) (results []*RecognitionResult, err error) {
err = tx.Where("task_id IN (?)", taskIDs).Find(&results).Error
return
return result, err
}
// Update applies the column/value pairs in updates to the row whose id matches
// and returns the resulting error.
func (dao *RecognitionResultDao) Update(tx *gorm.DB, id int64, updates map[string]interface{}) error {
	target := tx.Model(&RecognitionResult{}).Where("id = ?", id)
	return target.Updates(updates).Error
}
// GetByTaskIDs loads every result row whose task_id is in taskIDs and returns
// the rows together with the query error.
func (dao *RecognitionResultDao) GetByTaskIDs(tx *gorm.DB, taskIDs []int64) ([]*RecognitionResult, error) {
	var rows []*RecognitionResult
	query := tx.Where("task_id IN (?)", taskIDs).Find(&rows)
	return rows, query.Error
}

View File

@@ -20,6 +20,7 @@ const (
TaskTypeText TaskType = "TEXT"
TaskTypeTable TaskType = "TABLE"
TaskTypeLayout TaskType = "LAYOUT"
TaskTypePDF TaskType = "PDF"
)
func (t TaskType) String() string {