From 1214c9144851877aa6bea2052c81b346b969fd79 Mon Sep 17 00:00:00 2001 From: yoge Date: Tue, 31 Mar 2026 22:32:14 +0800 Subject: [PATCH] refact: update list api --- .../2026-03-30-pdf-recognition-glm-ocr.md | 1175 +++++++++++++++++ internal/model/task/request.go | 6 +- internal/service/task.go | 31 - internal/storage/dao/task.go | 8 +- 4 files changed, 1181 insertions(+), 39 deletions(-) create mode 100644 docs/superpowers/plans/2026-03-30-pdf-recognition-glm-ocr.md diff --git a/docs/superpowers/plans/2026-03-30-pdf-recognition-glm-ocr.md b/docs/superpowers/plans/2026-03-30-pdf-recognition-glm-ocr.md new file mode 100644 index 0000000..a2ccaf2 --- /dev/null +++ b/docs/superpowers/plans/2026-03-30-pdf-recognition-glm-ocr.md @@ -0,0 +1,1175 @@ +# PDF Recognition Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** 支持 PDF 逐页 OCR 识别(最多10页),同步重构 `recognition_results` 表为 JSON 内容结构,兼容公式识别和 PDF 识别两种场景。 + +**Architecture:** `recognition_results` 每个任务存一行:`meta_data` JSON 存元信息(`total_num`),`content` JSON 存识别内容(公式:`{latex, markdown, mathml, mml}`;PDF:`[{page_number, markdown}, ...]`)。PDF 处理链:下载 → `go-fitz` 分页渲染 → pre-hook 限前10页 → 逐页调用现有下游 OCR 接口 → 组装 JSON → 写入 DB。 + +**Tech Stack:** Go 1.20, Gin, GORM/MySQL, Redis, Aliyun OSS, `github.com/gen2brain/go-fitz` v0.24.0, 现有下游 OCR 接口 `cloud.texpixel.com` + +--- + +## 表结构设计 + +``` +recognition_results +├── id BIGINT PK +├── task_id BIGINT INDEX +├── task_type VARCHAR(16) -- FORMULA / PDF +├── meta_data JSON -- {"total_num": 1} +├── content JSON -- 见下方说明 +├── created_at DATETIME +└── updated_at DATETIME + +content 格式(按 task_type): + FORMULA: {"latex":"E=mc^2","markdown":"$$E=mc^2$$","mathml":"...","mml":"..."} + PDF: [{"page_number":1,"markdown":"# 第一章\n..."},{"page_number":2,"markdown":"..."}] +``` + +旧字段 `latex / markdown / mathml / mml` **全部删除**,由 `content` JSON 承接。 + +--- + +## 文件变更清单 + +| 操作 | 文件路径 | 职责 | +|------|---------|------| +| Create | `migrations/pdf_recognition.sql` | ALTER recognition_results:删旧字段,加 meta_data/content JSON | +| Modify | `internal/storage/dao/task.go` | 增加 TaskTypePDF 常量 | +| Modify | `internal/storage/dao/result.go` | 重构 RecognitionResult struct;新增内容类型辅助结构;更新 DAO 方法 | +| Create | `internal/model/pdf/request.go` | PDF 识别请求/响应 DTO | +| Create | `internal/storage/cache/pdf.go` | Redis 队列操作(PDF 专用) | +| Modify | `internal/service/recognition_service.go` | 更新 processFormulaTask / GetFormualTask 使用新 JSON 格式 | +| Create | `internal/service/pdf_recognition_service.go` | PDF 识别业务逻辑 | +| Create | `api/v1/pdf/handler.go` | HTTP 处理器 | +| Modify | `api/router.go` | 注册 PDF 路由 | +| Modify | `api/v1/oss/handler.go` | 文件类型白名单加 .pdf(大小上限由 OSS Policy Token 控制,不在本计划范围内,见 Task 10) | +| Modify | `go.mod` / `go.sum` | 添加 go-fitz 依赖 | + +--- + +## 环境前置:安装 MuPDF(go-fitz CGo 依赖) + +```bash +# macOS +brew install mupdf + +# Ubuntu/Debian +sudo apt-get install -y libmupdf-dev + +# 
验证 +pkg-config --modversion mupdf +``` + +--- + +## Task 1: 数据库迁移 — 重构 recognition_results + +**Files:** +- Create: `migrations/pdf_recognition.sql` + +- [ ] **Step 1: 创建迁移文件** + +```sql +-- migrations/pdf_recognition.sql + +-- 1. 删除旧的单字段列(已有数据可提前备份) +ALTER TABLE `recognition_results` + DROP COLUMN `latex`, + DROP COLUMN `markdown`, + DROP COLUMN `mathml`, + DROP COLUMN `mml`; + +-- 2. 增加 JSON 字段 +ALTER TABLE `recognition_results` + ADD COLUMN `meta_data` JSON DEFAULT NULL COMMENT '元数据 {"total_num":1}' AFTER `task_type`, + ADD COLUMN `content` JSON DEFAULT NULL COMMENT '识别内容 JSON' AFTER `meta_data`; +``` + +- [ ] **Step 2: 执行迁移** + +```bash +mysql -u root -p doc_ai < migrations/pdf_recognition.sql +``` + +Expected: Query OK + +- [ ] **Step 3: 验证表结构** + +```bash +mysql -u root -p doc_ai -e "DESCRIBE recognition_results;" +``` + +Expected: 字段为 id, task_id, task_type, meta_data, content, created_at, updated_at(无 latex/markdown/mathml/mml) + +- [ ] **Step 4: Commit** + +```bash +git add migrations/pdf_recognition.sql +git commit -m "feat: migrate recognition_results to JSON content schema" +``` + +--- + +## Task 2: 添加 go-fitz 依赖 + +**Files:** +- Modify: `go.mod` + +- [ ] **Step 1: 安装依赖** + +```bash +go get github.com/gen2brain/go-fitz@v0.24.0 +``` + +Expected: go: added github.com/gen2brain/go-fitz v0.24.0 + +- [ ] **Step 2: 验证** + +```bash +go build ./... +``` + +- [ ] **Step 3: Commit** + +```bash +git add go.mod go.sum +git commit -m "feat: add go-fitz for PDF page rendering" +``` + +--- + +## Task 3: 常量扩展 + +**Files:** +- Modify: `internal/storage/dao/task.go` + +- [ ] **Step 1: 添加 TaskTypePDF** + +找到 const 块,将: +```go +TaskTypeLayout TaskType = "LAYOUT" +``` +改为: +```go +TaskTypeLayout TaskType = "LAYOUT" +TaskTypePDF TaskType = "PDF" +``` + +- [ ] **Step 2: 验证** + +```bash +go build ./internal/storage/dao/... 
+``` + +- [ ] **Step 3: Commit** + +```bash +git add internal/storage/dao/task.go +git commit -m "feat: add TaskTypePDF constant" +``` + +--- + +## Task 4: DAO — 重构 RecognitionResult + +**Files:** +- Modify: `internal/storage/dao/result.go` + +- [ ] **Step 1: 用新 struct 完整替换 result.go 内容** + +```go +package dao + +import ( + "encoding/json" + + "gorm.io/gorm" +) + +// FormulaContent 公式识别的 content 字段结构 +type FormulaContent struct { + Latex string `json:"latex"` + Markdown string `json:"markdown"` + MathML string `json:"mathml"` + MML string `json:"mml"` +} + +// PDFPageContent PDF 单页识别结果 +type PDFPageContent struct { + PageNumber int `json:"page_number"` + Markdown string `json:"markdown"` +} + +// ResultMetaData recognition_results.meta_data 字段结构 +type ResultMetaData struct { + TotalNum int `json:"total_num"` +} + +// RecognitionResult recognition_results 表模型 +type RecognitionResult struct { + BaseModel + TaskID int64 `gorm:"column:task_id;bigint;not null;default:0;index;comment:任务ID" json:"task_id"` + TaskType TaskType `gorm:"column:task_type;varchar(16);not null;comment:任务类型;default:''" json:"task_type"` + MetaData string `gorm:"column:meta_data;type:json;comment:元数据" json:"meta_data"` + Content string `gorm:"column:content;type:json;comment:识别内容JSON" json:"content"` +} + +// SetMetaData 序列化并写入 MetaData 字段 +func (r *RecognitionResult) SetMetaData(meta ResultMetaData) error { + b, err := json.Marshal(meta) + if err != nil { + return err + } + r.MetaData = string(b) + return nil +} + +// GetFormulaContent 从 Content 字段反序列化公式结果 +func (r *RecognitionResult) GetFormulaContent() (*FormulaContent, error) { + var c FormulaContent + if err := json.Unmarshal([]byte(r.Content), &c); err != nil { + return nil, err + } + return &c, nil +} + +// GetPDFContent 从 Content 字段反序列化 PDF 分页结果 +func (r *RecognitionResult) GetPDFContent() ([]PDFPageContent, error) { + var pages []PDFPageContent + if err := json.Unmarshal([]byte(r.Content), &pages); err != nil { + return nil, err + } + 
return pages, nil +} + +// MarshalFormulaContent 将公式结果序列化为 JSON 字符串(供写入 Content) +func MarshalFormulaContent(c FormulaContent) (string, error) { + b, err := json.Marshal(c) + return string(b), err +} + +// MarshalPDFContent 将 PDF 分页结果序列化为 JSON 字符串(供写入 Content) +func MarshalPDFContent(pages []PDFPageContent) (string, error) { + b, err := json.Marshal(pages) + return string(b), err +} + +type RecognitionResultDao struct{} + +func NewRecognitionResultDao() *RecognitionResultDao { + return &RecognitionResultDao{} +} + +func (dao *RecognitionResultDao) Create(tx *gorm.DB, data RecognitionResult) error { + return tx.Create(&data).Error +} + +func (dao *RecognitionResultDao) GetByTaskID(tx *gorm.DB, taskID int64) (*RecognitionResult, error) { + result := &RecognitionResult{} + err := tx.Where("task_id = ?", taskID).First(result).Error + if err != nil && err == gorm.ErrRecordNotFound { + return nil, nil + } + return result, err +} + +func (dao *RecognitionResultDao) Update(tx *gorm.DB, id int64, updates map[string]interface{}) error { + return tx.Model(&RecognitionResult{}).Where("id = ?", id).Updates(updates).Error +} +``` + +- [ ] **Step 2: 验证编译** + +```bash +go build ./internal/storage/dao/... 
+``` + +Expected: 无报错 + +- [ ] **Step 3: Commit** + +```bash +git add internal/storage/dao/result.go +git commit -m "refactor: RecognitionResult to JSON content schema (meta_data + content)" +``` + +--- + +## Task 5: 更新公式识别 + TaskService — 适配新 JSON 格式 + +**Files:** +- Modify: `internal/service/recognition_service.go` +- Modify: `internal/service/task.go` + +> **注意**:迁移删除了 `latex/markdown/mathml/mml` 列,`task.go` 的 `GetTaskList`(:98-101)和 `ExportTask`(:151)都直接读这些字段,必须在同一个 commit 里一起更新,否则迁移后立即崩溃。 + +- [ ] **Step 1: 修改 recognition_service.go — processFormulaTask 写入** + +找到 `processFormulaTask` 内调用 `resultDao.Create` 的代码(约第542行): + +```go +// 旧代码 +err = resultDao.Create(tx, dao.RecognitionResult{ + TaskID: taskID, + TaskType: dao.TaskTypeFormula, + Latex: ocrResp.Latex, + Markdown: ocrResp.Markdown, + MathML: ocrResp.MathML, + MML: ocrResp.MML, +}) +``` + +替换为: + +```go +// 新代码 +contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{ + Latex: ocrResp.Latex, + Markdown: ocrResp.Markdown, + MathML: ocrResp.MathML, + MML: ocrResp.MML, +}) +if err != nil { + log.Error(ctx, "func", "processFormulaTask", "msg", "序列化公式内容失败", "error", err) + return err +} +result := dao.RecognitionResult{ + TaskID: taskID, + TaskType: dao.TaskTypeFormula, + Content: contentJSON, +} +if err = result.SetMetaData(dao.ResultMetaData{TotalNum: 1}); err != nil { + log.Error(ctx, "func", "processFormulaTask", "msg", "序列化MetaData失败", "error", err) + return err +} +err = resultDao.Create(tx, result) +if err != nil { + log.Error(ctx, "func", "processFormulaTask", "msg", "保存任务结果失败", "error", err) + return err +} +``` + +- [ ] **Step 2: 修改 recognition_service.go — processVLFormulaTask 写入** + +找到 `processVLFormulaTask` 内对 `resultDao.Create` / `resultDao.Update` 的调用(约第665-678行): + +创建时: +```go +contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{Latex: latex}) +if err != nil { + log.Error(ctx, "func", "processVLFormulaTask", "msg", "序列化公式内容失败", "error", err) + return err +} +newResult 
:= dao.RecognitionResult{TaskID: taskID, TaskType: dao.TaskTypeFormula, Content: contentJSON} +_ = newResult.SetMetaData(dao.ResultMetaData{TotalNum: 1}) +err = resultDao.Create(dao.DB.WithContext(ctx), newResult) +``` + +更新时: +```go +contentJSON, err := dao.MarshalFormulaContent(dao.FormulaContent{Latex: latex}) +if err != nil { + log.Error(ctx, "func", "processVLFormulaTask", "msg", "序列化公式内容失败", "error", err) + return err +} +err = resultDao.Update(dao.DB.WithContext(ctx), result.ID, map[string]interface{}{"content": contentJSON}) +``` + +- [ ] **Step 3: 修改 recognition_service.go — GetFormualTask 读取** + +找到 `GetFormualTask`(约第134行),将读取旧字段的代码: + +```go +// 旧代码:直接读 taskRet.Latex / taskRet.Markdown / taskRet.MathML / taskRet.MML +markdown := taskRet.Markdown +if markdown == "" { + markdown = fmt.Sprintf("$$%s$$", taskRet.Latex) +} +return &formula.GetFormulaTaskResponse{ + TaskNo: taskNo, + Latex: taskRet.Latex, + Markdown: markdown, + MathML: taskRet.MathML, + MML: taskRet.MML, + Status: int(task.Status), +}, nil +``` + +替换为: + +```go +// 新代码 +formulaContent, err := taskRet.GetFormulaContent() +if err != nil { + log.Error(ctx, "func", "GetFormualTask", "msg", "解析公式内容失败", "error", err) + return nil, common.NewError(common.CodeSystemError, "解析识别结果失败", err) +} +markdown := formulaContent.Markdown +if markdown == "" { + markdown = fmt.Sprintf("$$%s$$", formulaContent.Latex) +} +return &formula.GetFormulaTaskResponse{ + TaskNo: taskNo, + Latex: formulaContent.Latex, + Markdown: markdown, + MathML: formulaContent.MathML, + MML: formulaContent.MML, + Status: int(task.Status), +}, nil +``` + +- [ ] **Step 4: 修改 task.go — GetTaskList 读取结果(:91-119)** + +找到 `GetTaskList` 中组装 DTO 的代码: + +```go +// 旧代码 +var latex, markdown, mathML, mml string +recognitionResult := recognitionResultMap[item.ID] +if recognitionResult != nil { + latex = recognitionResult.Latex + markdown = recognitionResult.Markdown + mathML = recognitionResult.MathML + mml = recognitionResult.MML +} +``` + +替换为: 
+ +```go +// 新代码:按 task_type 反序列化 content +var latex, markdown, mathML, mml string +recognitionResult := recognitionResultMap[item.ID] +if recognitionResult != nil && recognitionResult.TaskType == dao.TaskTypeFormula { + if fc, err := recognitionResult.GetFormulaContent(); err == nil { + latex = fc.Latex + markdown = fc.Markdown + mathML = fc.MathML + mml = fc.MML + } +} +// PDF 类型的 TaskListDTO 暂不展开 content(列表页只显示状态) +``` + +- [ ] **Step 5: 修改 task.go — ExportTask 读取 markdown(:140-155)** + +找到 `ExportTask` 中读取 markdown 的代码: + +```go +// 旧代码 +markdown := recognitionResult.Markdown +if markdown == "" { + log.Error(ctx, "func", "ExportTask", "msg", "markdown not found") + return nil, "", errors.New("markdown not found") +} +``` + +替换为: + +```go +// 新代码:按 task_type 解析 content +var markdown string +switch recognitionResult.TaskType { +case dao.TaskTypeFormula: + fc, err := recognitionResult.GetFormulaContent() + if err != nil || fc.Markdown == "" { + log.Error(ctx, "func", "ExportTask", "msg", "公式结果解析失败或markdown为空", "error", err) + return nil, "", errors.New("markdown not found") + } + markdown = fc.Markdown +default: + log.Error(ctx, "func", "ExportTask", "msg", "不支持的导出任务类型", "task_type", recognitionResult.TaskType) + return nil, "", errors.New("unsupported task type for export") +} +``` + +- [ ] **Step 6: 验证编译** + +```bash +go build ./internal/service/... 
+``` + +Expected: 无报错 + +- [ ] **Step 7: Commit** + +```bash +git add internal/service/recognition_service.go internal/service/task.go +git commit -m "refactor: adapt all recognition result reads/writes to JSON content schema" +``` + +--- + +## Task 6: Cache — PDF Redis 队列 + +**Files:** +- Create: `internal/storage/cache/pdf.go` + +- [ ] **Step 1: 创建 pdf.go** + +```go +// internal/storage/cache/pdf.go +package cache + +import ( + "context" + "strconv" +) + +const ( + PDFRecognitionTaskQueue = "pdf_recognition_queue" + PDFRecognitionDistLock = "pdf_recognition_dist_lock" +) + +func PushPDFTask(ctx context.Context, taskID int64) (int64, error) { + return RedisClient.LPush(ctx, PDFRecognitionTaskQueue, taskID).Result() +} + +func PopPDFTask(ctx context.Context) (int64, error) { + result, err := RedisClient.BRPop(ctx, 0, PDFRecognitionTaskQueue).Result() + if err != nil { + return 0, err + } + return strconv.ParseInt(result[1], 10, 64) +} + +func GetPDFDistributedLock(ctx context.Context) (bool, error) { + return RedisClient.SetNX(ctx, PDFRecognitionDistLock, "locked", DefaultLockTimeout).Result() +} +``` + +- [ ] **Step 2: 验证** + +```bash +go build ./internal/storage/cache/... 
+``` + +- [ ] **Step 3: Commit** + +```bash +git add internal/storage/cache/pdf.go +git commit -m "feat: add PDF recognition Redis queue" +``` + +--- + +## Task 7: Model — PDF 请求/响应 DTO + +**Files:** +- Create: `internal/model/pdf/request.go` + +- [ ] **Step 1: 创建文件** + +```go +// internal/model/pdf/request.go +package pdf + +// CreatePDFRecognitionRequest 创建PDF识别任务 +type CreatePDFRecognitionRequest struct { + FileURL string `json:"file_url" binding:"required"` + FileHash string `json:"file_hash" binding:"required"` + FileName string `json:"file_name" binding:"required"` + UserID int64 `json:"user_id"` +} + +// GetPDFTaskRequest URI 参数 +type GetPDFTaskRequest struct { + TaskNo string `uri:"task_no" binding:"required"` +} + +// CreatePDFTaskResponse 创建任务响应 +type CreatePDFTaskResponse struct { + TaskNo string `json:"task_no"` + Status int `json:"status"` +} + +// PDFPageResult 单页结果(与 dao.PDFPageContent 对应) +type PDFPageResult struct { + PageNumber int `json:"page_number"` + Markdown string `json:"markdown"` +} + +// GetPDFTaskResponse 查询任务状态和结果 +type GetPDFTaskResponse struct { + TaskNo string `json:"task_no"` + Status int `json:"status"` // 0=PENDING 1=PROCESSING 2=COMPLETED 3=FAILED + TotalPages int `json:"total_pages"` // 实际处理的页数 + Pages []PDFPageResult `json:"pages"` // status=2 时填充 +} +``` + +- [ ] **Step 2: 验证** + +```bash +go build ./internal/model/pdf/... 
+``` + +- [ ] **Step 3: Commit** + +```bash +git add internal/model/pdf/request.go +git commit -m "feat: add PDF recognition request/response models" +``` + +--- + +## Task 8: Service — PDFRecognitionService + +**Files:** +- Create: `internal/service/pdf_recognition_service.go` + +- [ ] **Step 1: 创建服务文件** + +```go +// internal/service/pdf_recognition_service.go +package service + +import ( + "bytes" + "context" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "net/http" + "time" + + "github.com/gen2brain/go-fitz" + + "gitea.com/texpixel/document_ai/internal/model/formula" + pdfmodel "gitea.com/texpixel/document_ai/internal/model/pdf" + "gitea.com/texpixel/document_ai/internal/storage/cache" + "gitea.com/texpixel/document_ai/internal/storage/dao" + "gitea.com/texpixel/document_ai/pkg/common" + "gitea.com/texpixel/document_ai/pkg/httpclient" + "gitea.com/texpixel/document_ai/pkg/log" + "gitea.com/texpixel/document_ai/pkg/oss" + "gitea.com/texpixel/document_ai/pkg/requestid" + "gitea.com/texpixel/document_ai/pkg/utils" + "gorm.io/gorm" +) + +const ( + pdfMaxPages = 10 + pdfOCREndpoint = "https://cloud.texpixel.com:10443/doc_process/v1/image/ocr" +) + +// PDFRecognitionService 处理 PDF 识别任务 +type PDFRecognitionService struct { + db *gorm.DB + queueLimit chan struct{} + stopChan chan struct{} + httpClient *httpclient.Client +} + +func NewPDFRecognitionService() *PDFRecognitionService { + s := &PDFRecognitionService{ + db: dao.DB, + queueLimit: make(chan struct{}, 3), + stopChan: make(chan struct{}), + httpClient: httpclient.NewClient(nil), + } + + utils.SafeGo(func() { + lock, err := cache.GetPDFDistributedLock(context.Background()) + if err != nil || !lock { + log.Error(context.Background(), "func", "NewPDFRecognitionService", "msg", "获取PDF分布式锁失败") + return + } + s.processPDFQueue(context.Background()) + }) + + return s +} + +// CreatePDFTask 创建识别任务并入队 +func (s *PDFRecognitionService) CreatePDFTask(ctx context.Context, req *pdfmodel.CreatePDFRecognitionRequest) 
(*dao.RecognitionTask, error) { + task := &dao.RecognitionTask{ + UserID: req.UserID, + TaskUUID: utils.NewUUID(), + TaskType: dao.TaskTypePDF, + Status: dao.TaskStatusPending, + FileURL: req.FileURL, + FileName: req.FileName, + FileHash: req.FileHash, + IP: common.GetIPFromContext(ctx), + } + + if err := dao.NewRecognitionTaskDao().Create(dao.DB.WithContext(ctx), task); err != nil { + log.Error(ctx, "func", "CreatePDFTask", "msg", "创建任务失败", "error", err) + return nil, common.NewError(common.CodeDBError, "创建任务失败", err) + } + + if _, err := cache.PushPDFTask(ctx, task.ID); err != nil { + log.Error(ctx, "func", "CreatePDFTask", "msg", "推入队列失败", "error", err) + return nil, common.NewError(common.CodeSystemError, "推入队列失败", err) + } + + return task, nil +} + +// GetPDFTask 查询任务状态和结果 +func (s *PDFRecognitionService) GetPDFTask(ctx context.Context, taskNo string) (*pdfmodel.GetPDFTaskResponse, error) { + sess := dao.DB.WithContext(ctx) + task, err := dao.NewRecognitionTaskDao().GetByTaskNo(sess, taskNo) + if err != nil { + if err == gorm.ErrRecordNotFound { + return nil, common.NewError(common.CodeNotFound, "任务不存在", err) + } + return nil, common.NewError(common.CodeDBError, "查询任务失败", err) + } + + // 类型校验:防止公式任务被当成 PDF 解析 + if task.TaskType != dao.TaskTypePDF { + return nil, common.NewError(common.CodeNotFound, "任务不存在", nil) + } + + resp := &pdfmodel.GetPDFTaskResponse{ + TaskNo: taskNo, + Status: int(task.Status), + } + + if task.Status != dao.TaskStatusCompleted { + return resp, nil + } + + result, err := dao.NewRecognitionResultDao().GetByTaskID(sess, task.ID) + if err != nil || result == nil { + return nil, common.NewError(common.CodeDBError, "查询识别结果失败", err) + } + + pages, err := result.GetPDFContent() + if err != nil { + return nil, common.NewError(common.CodeSystemError, "解析识别结果失败", err) + } + + resp.TotalPages = len(pages) + for _, p := range pages { + resp.Pages = append(resp.Pages, pdfmodel.PDFPageResult{ + PageNumber: p.PageNumber, + Markdown: p.Markdown, + }) + 
} + + return resp, nil +} + +// processPDFQueue 持续消费队列 +func (s *PDFRecognitionService) processPDFQueue(ctx context.Context) { + for { + select { + case <-s.stopChan: + return + default: + s.processOnePDFTask(ctx) + } + } +} + +func (s *PDFRecognitionService) processOnePDFTask(ctx context.Context) { + s.queueLimit <- struct{}{} + defer func() { <-s.queueLimit }() + + taskID, err := cache.PopPDFTask(ctx) + if err != nil { + log.Error(ctx, "func", "processOnePDFTask", "msg", "获取任务失败", "error", err) + return + } + + task, err := dao.NewRecognitionTaskDao().GetTaskByID(dao.DB.WithContext(ctx), taskID) + if err != nil || task == nil { + log.Error(ctx, "func", "processOnePDFTask", "msg", "任务不存在", "task_id", taskID) + return + } + + ctx = context.WithValue(ctx, utils.RequestIDKey, task.TaskUUID) + requestid.SetRequestID(task.TaskUUID, func() { + if err := s.processPDFTask(ctx, taskID, task.FileURL); err != nil { + log.Error(ctx, "func", "processOnePDFTask", "msg", "处理PDF任务失败", "error", err) + } + }) +} + +// processPDFTask 核心处理:下载 → pre-hook → 逐页OCR → 写入DB +func (s *PDFRecognitionService) processPDFTask(ctx context.Context, taskID int64, fileURL string) error { + ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + + taskDao := dao.NewRecognitionTaskDao() + resultDao := dao.NewRecognitionResultDao() + + isSuccess := false + defer func() { + status, remark := dao.TaskStatusFailed, "任务处理失败" + if isSuccess { + status, remark = dao.TaskStatusCompleted, "" + } + _ = taskDao.Update(dao.DB.WithContext(context.Background()), + map[string]interface{}{"id": taskID}, + map[string]interface{}{"status": status, "completed_at": time.Now(), "remark": remark}, + ) + }() + + // 更新为处理中 + if err := taskDao.Update(dao.DB.WithContext(ctx), + map[string]interface{}{"id": taskID}, + map[string]interface{}{"status": dao.TaskStatusProcessing}, + ); err != nil { + return fmt.Errorf("更新任务状态失败: %w", err) + } + + // 下载 PDF + reader, err := oss.DownloadFile(ctx, fileURL) + if err 
!= nil { + return fmt.Errorf("下载PDF失败: %w", err) + } + defer reader.Close() + + pdfBytes, err := io.ReadAll(reader) + if err != nil { + return fmt.Errorf("读取PDF数据失败: %w", err) + } + + // 打开 PDF + doc, err := fitz.NewFromMemory(pdfBytes) + if err != nil { + return fmt.Errorf("解析PDF失败: %w", err) + } + defer doc.Close() + + // pre-hook: 限制最多处理前 10 页 + totalInDoc := doc.NumPage() + processPages := totalInDoc + if processPages > pdfMaxPages { + processPages = pdfMaxPages + log.Info(ctx, "func", "processPDFTask", "msg", "PDF超过10页,只处理前10页", + "task_id", taskID, "doc_total", totalInDoc) + } + + log.Info(ctx, "func", "processPDFTask", "msg", "开始处理PDF", + "task_id", taskID, "process_pages", processPages) + + // 逐页渲染 + OCR,结果收集 + var pages []dao.PDFPageContent + for pageNum := 0; pageNum < processPages; pageNum++ { + imgBytes, err := doc.ImagePNG(pageNum, 150) // 150 DPI + if err != nil { + return fmt.Errorf("渲染第%d页失败: %w", pageNum+1, err) + } + + ocrResult, err := s.callOCR(ctx, imgBytes) + if err != nil { + return fmt.Errorf("OCR第%d页失败: %w", pageNum+1, err) + } + + pages = append(pages, dao.PDFPageContent{ + PageNumber: pageNum + 1, + Markdown: ocrResult.Markdown, + }) + log.Info(ctx, "func", "processPDFTask", "msg", "页面OCR完成", + "page", pageNum+1, "total", processPages) + } + + // 序列化并写入 DB(单行) + contentJSON, err := dao.MarshalPDFContent(pages) + if err != nil { + return fmt.Errorf("序列化PDF内容失败: %w", err) + } + + dbResult := dao.RecognitionResult{ + TaskID: taskID, + TaskType: dao.TaskTypePDF, + Content: contentJSON, + } + if err := dbResult.SetMetaData(dao.ResultMetaData{TotalNum: processPages}); err != nil { + return fmt.Errorf("序列化MetaData失败: %w", err) + } + if err := resultDao.Create(dao.DB.WithContext(ctx), dbResult); err != nil { + return fmt.Errorf("保存PDF结果失败: %w", err) + } + + isSuccess = true + return nil +} + +// callOCR 调用与公式识别相同的下游 OCR 接口 +func (s *PDFRecognitionService) callOCR(ctx context.Context, imgBytes []byte) (*formula.ImageOCRResponse, error) { + reqBody 
:= map[string]string{ + "image_base64": base64.StdEncoding.EncodeToString(imgBytes), + } + jsonData, err := json.Marshal(reqBody) + if err != nil { + return nil, err + } + + headers := map[string]string{ + "Content-Type": "application/json", + utils.RequestIDHeaderKey: utils.GetRequestIDFromContext(ctx), + } + + resp, err := s.httpClient.RequestWithRetry(ctx, http.MethodPost, pdfOCREndpoint, bytes.NewReader(jsonData), headers) + if err != nil { + return nil, fmt.Errorf("请求OCR接口失败: %w", err) + } + defer resp.Body.Close() + + // 下游非 2xx 视为失败,避免把错误响应 body 当成识别结果存库 + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("OCR接口返回非200状态: %d, body: %s", resp.StatusCode, string(body)) + } + + var ocrResp formula.ImageOCRResponse + if err := json.NewDecoder(resp.Body).Decode(&ocrResp); err != nil { + return nil, fmt.Errorf("解析OCR响应失败: %w", err) + } + + return &ocrResp, nil +} + +func (s *PDFRecognitionService) Stop() { + close(s.stopChan) +} +``` + +- [ ] **Step 2: 验证编译** + +```bash +go build ./internal/service/... 
+``` + +Expected: 无报错 + +- [ ] **Step 3: Commit** + +```bash +git add internal/service/pdf_recognition_service.go +git commit -m "feat: add PDFRecognitionService with 10-page pre-hook" +``` + +--- + +## Task 9: Handler — api/v1/pdf/handler.go + +**Files:** +- Create: `api/v1/pdf/handler.go` + +- [ ] **Step 1: 创建 handler** + +```go +// api/v1/pdf/handler.go +package pdf + +import ( + "net/http" + "path/filepath" + "strings" + + pdfmodel "gitea.com/texpixel/document_ai/internal/model/pdf" + "gitea.com/texpixel/document_ai/internal/service" + "gitea.com/texpixel/document_ai/pkg/common" + "gitea.com/texpixel/document_ai/pkg/constant" + + "github.com/gin-gonic/gin" +) + +type PDFEndpoint struct { + pdfService *service.PDFRecognitionService +} + +func NewPDFEndpoint() *PDFEndpoint { + return &PDFEndpoint{ + pdfService: service.NewPDFRecognitionService(), + } +} + +func (e *PDFEndpoint) CreateTask(c *gin.Context) { + var req pdfmodel.CreatePDFRecognitionRequest + if err := c.BindJSON(&req); err != nil { + c.JSON(http.StatusOK, common.ErrorResponse(c, common.CodeParamError, "参数错误")) + return + } + req.UserID = c.GetInt64(constant.ContextUserID) + + if strings.ToLower(filepath.Ext(req.FileName)) != ".pdf" { + c.JSON(http.StatusOK, common.ErrorResponse(c, common.CodeParamError, "仅支持PDF文件")) + return + } + + task, err := e.pdfService.CreatePDFTask(c, &req) + if err != nil { + c.JSON(http.StatusOK, common.ErrorResponse(c, common.CodeSystemError, err.Error())) + return + } + + c.JSON(http.StatusOK, common.SuccessResponse(c, &pdfmodel.CreatePDFTaskResponse{ + TaskNo: task.TaskUUID, + Status: int(task.Status), + })) +} + +func (e *PDFEndpoint) GetTaskStatus(c *gin.Context) { + var req pdfmodel.GetPDFTaskRequest + if err := c.ShouldBindUri(&req); err != nil { + c.JSON(http.StatusOK, common.ErrorResponse(c, common.CodeParamError, "参数错误")) + return + } + + resp, err := e.pdfService.GetPDFTask(c, req.TaskNo) + if err != nil { + // 透传 BusinessError 的错误码,让 404 返回 CodeNotFound 而不是统一包成 
CodeSystemError + if bizErr, ok := err.(*common.BusinessError); ok { + c.JSON(http.StatusOK, common.ErrorResponse(c, int(bizErr.Code), bizErr.Message)) + return + } + c.JSON(http.StatusOK, common.ErrorResponse(c, common.CodeSystemError, err.Error())) + return + } + + c.JSON(http.StatusOK, common.SuccessResponse(c, resp)) +} +``` + +- [ ] **Step 2: 验证** + +```bash +go build ./api/... +``` + +- [ ] **Step 3: Commit** + +```bash +git add api/v1/pdf/handler.go +git commit -m "feat: add PDF recognition HTTP handler" +``` + +--- + +## Task 10: Router + OSS Handler + +> **OSS 大小限制说明**:当前 `GetSignatureURL` handler 不做文件大小校验(没有 `file_size` 入参),大小限制由 Aliyun OSS Policy Token 的 `content-length-range` 条件控制。如需放宽 PDF 上传的大小上限,需修改 `pkg/oss` 中生成 Policy Token 的逻辑(在本 Task 范围之外)。本 Task 只处理文件类型白名单。 + +**Files:** +- Modify: `api/router.go` +- Modify: `api/v1/oss/handler.go` + +- [ ] **Step 1: 在 router.go 添加 PDF import 和路由** + +import 块添加: +```go +"gitea.com/texpixel/document_ai/api/v1/pdf" +``` + +`SetupRouter` 的 v1 块末尾添加: +```go +pdfRouter := v1.Group("/pdf", common.GetAuthMiddleware()) +{ + endpoint := pdf.NewPDFEndpoint() + pdfRouter.POST("/recognition", endpoint.CreateTask) + pdfRouter.GET("/recognition/:task_no", endpoint.GetTaskStatus) +} +``` + +- [ ] **Step 2: 在 oss/handler.go 的白名单中添加 .pdf** + +找到(`handler.go:73`): +```go +if !utils.InArray(extend, []string{".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"}) { +``` +改为: +```go +if !utils.InArray(extend, []string{".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp", ".pdf"}) { +``` + +- [ ] **Step 3: 验证整体编译** + +```bash +go build ./... 
+``` + +Expected: 无报错 + +- [ ] **Step 4: 冒烟测试路由** + +```bash +go run main.go & +curl -X GET http://localhost:8024/v1/pdf/recognition/fake-task-no \ + -H "Authorization: Bearer YOUR_TOKEN" +``` + +Expected: `{"code":404,"message":"任务不存在",...}` — GetByTaskNo 返回 ErrRecordNotFound → service 返回 CodeNotFound BusinessError → handler 透传错误码 + +- [ ] **Step 5: Commit** + +```bash +git add api/router.go api/v1/oss/handler.go +git commit -m "feat: register PDF routes and allow .pdf upload in OSS handler" +``` + +--- + +## 前端交互流程 + +``` +1. POST /v1/oss/signature_url { file_name: "doc.pdf", file_hash } + → { sign_url, path: "formula/uuid.pdf" } + +2. PUT sign_url (直传 PDF 到 OSS) + +3. POST /v1/pdf/recognition { file_url, file_hash, file_name: "doc.pdf" } + → { task_no: "uuid", status: 0 } + +4. GET /v1/pdf/recognition/:task_no (每3秒轮询) + → status=1 { task_no, status:1, total_pages:0, pages:null } (未完成时 pages 为 nil 切片,JSON 序列化为 null,前端需判空) + +5. status=2 时: + { + "task_no": "uuid", + "status": 2, + "total_pages": 8, ← 实际处理页数(最多10) + "pages": [ + { "page_number": 1, "markdown": "# 第一章\n..." }, + { "page_number": 2, "markdown": "## 1.1\n..." 
} + ] + } +``` + +--- + +## 数据库样例 + +```sql +-- recognition_results 表中 PDF 任务的一行示例 +INSERT INTO recognition_results (task_id, task_type, meta_data, content) VALUES ( + 123, + 'PDF', + '{"total_num":8}', + '[{"page_number":1,"markdown":"# 第一章\n正文..."},{"page_number":2,"markdown":"## 1.1\n..."}]' +); + +-- FORMULA 任务的一行示例 +INSERT INTO recognition_results (task_id, task_type, meta_data, content) VALUES ( + 456, + 'FORMULA', + '{"total_num":1}', + '{"latex":"E=mc^2","markdown":"$$E=mc^2$$","mathml":"...","mml":""}' +); +``` + +--- + +## 自检清单 + +- [x] **Breaking change 全覆盖**: 迁移删旧列后,`recognition_service.go`(3处写/读)和 `task.go`(GetTaskList + ExportTask 2处读)在同一 commit 里全部更新,不存在中间状态崩溃窗口 +- [x] **单行存储**: PDF 所有页面的结果存为一行的 JSON array,不增加新表 +- [x] **pre-hook**: processPDFTask 开头 clamp processPages ≤ 10,写日志说明 +- [x] **OCR 接口复用**: PDF 与公式识别调用同一下游端点,请求格式(image_base64)完全相同 +- [x] **GetPDFTask 类型校验**: 获取任务后校验 TaskType == PDF,类型不符返回 CodeNotFound,防止公式任务被当 PDF 解析 +- [x] **callOCR StatusCode 检查**: 下游非 200 立即返回 error,不解析 body,防止把错误响应存为识别结果 +- [x] **Handler 错误码透传**: GetTaskStatus 检查 `*common.BusinessError`,透传 Code 字段,404 正确返回 code=404 +- [x] **meta_data.total_num**: 公式=1,PDF=实际处理页数 +- [x] **错误恢复**: defer 保证异常时任务状态更新为 FAILED +- [x] **超时**: PDF 任务 10 分钟超时(10页 × ~45秒) +- [x] **OSS 大小限制**: handler 无代码侧大小校验,限制由 OSS Policy Token 的 content-length-range 控制;本计划只扩展文件类型白名单 diff --git a/internal/model/task/request.go b/internal/model/task/request.go index 74bf68f..490ac70 100644 --- a/internal/model/task/request.go +++ b/internal/model/task/request.go @@ -8,7 +8,7 @@ type EvaluateTaskRequest struct { } type TaskListRequest struct { - TaskType string `json:"task_type" form:"task_type" binding:"required"` + TaskType string `json:"task_type" form:"task_type"` Page int `json:"page" form:"page"` PageSize int `json:"page_size" form:"page_size"` UserID int64 `json:"-"` @@ -21,10 +21,6 @@ type TaskListDTO struct { OriginURL string `json:"origin_url"` TaskType string `json:"task_type"` CreatedAt string 
`json:"created_at"` - Latex string `json:"latex"` - Markdown string `json:"markdown"` - MathML string `json:"mathml"` - MML string `json:"mml"` } type TaskListResponse struct { diff --git a/internal/service/task.go b/internal/service/task.go index d099b2b..ab41814 100644 --- a/internal/service/task.go +++ b/internal/service/task.go @@ -68,47 +68,16 @@ func (svc *TaskService) GetTaskList(ctx context.Context, req *task.TaskListReque return nil, err } - taskIDs := make([]int64, 0, len(tasks)) - for _, item := range tasks { - taskIDs = append(taskIDs, item.ID) - } - - recognitionResults, err := svc.recognitionResultDao.GetByTaskIDs(dao.DB.WithContext(ctx), taskIDs) - if err != nil { - log.Error(ctx, "func", "GetTaskList", "msg", "get recognition results failed", "error", err) - return nil, err - } - - recognitionResultMap := make(map[int64]*dao.RecognitionResult) - for _, item := range recognitionResults { - recognitionResultMap[item.TaskID] = item - } - resp := &task.TaskListResponse{ TaskList: make([]*task.TaskListDTO, 0, len(tasks)), Total: total, } for _, item := range tasks { - var latex, markdown, mathML, mml string - recognitionResult := recognitionResultMap[item.ID] - if recognitionResult != nil && recognitionResult.TaskType == dao.TaskTypeFormula { - if fc, err := recognitionResult.GetFormulaContent(); err == nil { - latex = fc.Latex - markdown = fc.Markdown - mathML = fc.MathML - mml = fc.MML - } - } - // PDF 类型的 TaskListDTO 暂不展开 content(列表页只显示状态) originURL, err := oss.GetDownloadURL(ctx, item.FileURL) if err != nil { log.Error(ctx, "func", "GetTaskList", "msg", "get origin url failed", "error", err) } resp.TaskList = append(resp.TaskList, &task.TaskListDTO{ - Latex: latex, - Markdown: markdown, - MathML: mathML, - MML: mml, TaskID: item.TaskUUID, FileName: item.FileName, Status: int(item.Status), diff --git a/internal/storage/dao/task.go b/internal/storage/dao/task.go index 16bae9e..b0ae65c 100644 --- a/internal/storage/dao/task.go +++ 
b/internal/storage/dao/task.go @@ -89,12 +89,14 @@ func (dao *RecognitionTaskDao) GetTaskByID(tx *gorm.DB, id int64) (task *Recogni } func (dao *RecognitionTaskDao) GetTaskList(tx *gorm.DB, userID int64, taskType TaskType, page int, pageSize int) (tasks []*RecognitionTask, total int64, err error) { - offset := (page - 1) * pageSize - query := tx.Model(RecognitionTask{}).Where("user_id = ? AND task_type = ?", userID, taskType) + query := tx.Model(RecognitionTask{}).Where("user_id = ?", userID) + if taskType != "" { + query = query.Where("task_type = ?", taskType) + } err = query.Count(&total).Error if err != nil { return nil, 0, err } - err = query.Offset(offset).Limit(pageSize).Order(clause.OrderByColumn{Column: clause.Column{Name: "id"}, Desc: true}).Find(&tasks).Error + err = query.Order(clause.OrderByColumn{Column: clause.Column{Name: "id"}, Desc: true}).Offset((page - 1) * pageSize).Limit(pageSize).Find(&tasks).Error return tasks, total, err }