Files
doc_ai_backed/internal/service/task.go
yoge 9d712c921a feat: add PDF document recognition with 10-page pre-hook
- Migrate recognition_results table to JSON schema (meta_data + content),
  replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
  rendering; limits processing to first 10 pages (pre-hook)
- Reuse existing downstream OCR endpoint (cloud.texpixel.com) for each
  page image; stores results as [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-31 14:17:44 +08:00

219 lines
7.0 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package service
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strings"
"gitea.com/texpixel/document_ai/internal/model/task"
"gitea.com/texpixel/document_ai/internal/storage/dao"
"gitea.com/texpixel/document_ai/pkg/log"
"gitea.com/texpixel/document_ai/pkg/oss"
)
type TaskService struct {
recognitionTaskDao *dao.RecognitionTaskDao
evaluateTaskDao *dao.EvaluateTaskDao
recognitionResultDao *dao.RecognitionResultDao
}
func NewTaskService() *TaskService {
return &TaskService{
recognitionTaskDao: dao.NewRecognitionTaskDao(),
evaluateTaskDao: dao.NewEvaluateTaskDao(),
recognitionResultDao: dao.NewRecognitionResultDao(),
}
}
func (svc *TaskService) EvaluateTask(ctx context.Context, req *task.EvaluateTaskRequest) error {
task, err := svc.recognitionTaskDao.GetByTaskNo(dao.DB.WithContext(ctx), req.TaskNo)
if err != nil {
log.Error(ctx, "func", "EvaluateTask", "msg", "get task by task no failed", "error", err)
return err
}
if task == nil {
log.Error(ctx, "func", "EvaluateTask", "msg", "task not found")
return errors.New("task not found")
}
if task.Status != dao.TaskStatusCompleted {
log.Error(ctx, "func", "EvaluateTask", "msg", "task not finished")
return errors.New("task not finished")
}
evaluateTask := &dao.EvaluateTask{
TaskID: task.ID,
Satisfied: req.Satisfied,
Feedback: req.Feedback,
Comment: strings.Join(req.Suggestion, ","),
}
err = svc.evaluateTaskDao.Create(dao.DB.WithContext(ctx), evaluateTask)
if err != nil {
log.Error(ctx, "func", "EvaluateTask", "msg", "create evaluate task failed", "error", err)
return err
}
return nil
}
func (svc *TaskService) GetTaskList(ctx context.Context, req *task.TaskListRequest) (*task.TaskListResponse, error) {
tasks, total, err := svc.recognitionTaskDao.GetTaskList(dao.DB.WithContext(ctx), req.UserID, dao.TaskType(req.TaskType), req.Page, req.PageSize)
if err != nil {
log.Error(ctx, "func", "GetTaskList", "msg", "get task list failed", "error", err)
return nil, err
}
taskIDs := make([]int64, 0, len(tasks))
for _, item := range tasks {
taskIDs = append(taskIDs, item.ID)
}
recognitionResults, err := svc.recognitionResultDao.GetByTaskIDs(dao.DB.WithContext(ctx), taskIDs)
if err != nil {
log.Error(ctx, "func", "GetTaskList", "msg", "get recognition results failed", "error", err)
return nil, err
}
recognitionResultMap := make(map[int64]*dao.RecognitionResult)
for _, item := range recognitionResults {
recognitionResultMap[item.TaskID] = item
}
resp := &task.TaskListResponse{
TaskList: make([]*task.TaskListDTO, 0, len(tasks)),
Total: total,
}
for _, item := range tasks {
var latex, markdown, mathML, mml string
recognitionResult := recognitionResultMap[item.ID]
if recognitionResult != nil && recognitionResult.TaskType == dao.TaskTypeFormula {
if fc, err := recognitionResult.GetFormulaContent(); err == nil {
latex = fc.Latex
markdown = fc.Markdown
mathML = fc.MathML
mml = fc.MML
}
}
// PDF 类型的 TaskListDTO 暂不展开 content列表页只显示状态
originURL, err := oss.GetDownloadURL(ctx, item.FileURL)
if err != nil {
log.Error(ctx, "func", "GetTaskList", "msg", "get origin url failed", "error", err)
}
resp.TaskList = append(resp.TaskList, &task.TaskListDTO{
Latex: latex,
Markdown: markdown,
MathML: mathML,
MML: mml,
TaskID: item.TaskUUID,
FileName: item.FileName,
Status: int(item.Status),
OriginURL: originURL,
TaskType: item.TaskType.String(),
CreatedAt: item.CreatedAt.Format("2006-01-02 15:04:05"),
})
}
return resp, nil
}
func (svc *TaskService) ExportTask(ctx context.Context, req *task.ExportTaskRequest) ([]byte, string, error) {
recognitionTask, err := svc.recognitionTaskDao.GetByTaskNo(dao.DB.WithContext(ctx), req.TaskNo)
if err != nil {
log.Error(ctx, "func", "ExportTask", "msg", "get task by task id failed", "error", err)
return nil, "", err
}
if recognitionTask == nil {
log.Error(ctx, "func", "ExportTask", "msg", "task not found")
return nil, "", errors.New("task not found")
}
if recognitionTask.Status != dao.TaskStatusCompleted {
log.Error(ctx, "func", "ExportTask", "msg", "task not finished")
return nil, "", errors.New("task not finished")
}
recognitionResult, err := svc.recognitionResultDao.GetByTaskID(dao.DB.WithContext(ctx), recognitionTask.ID)
if err != nil {
log.Error(ctx, "func", "ExportTask", "msg", "get recognition result by task id failed", "error", err)
return nil, "", err
}
if recognitionResult == nil {
log.Error(ctx, "func", "ExportTask", "msg", "recognition result not found")
return nil, "", errors.New("recognition result not found")
}
var markdown string
switch recognitionResult.TaskType {
case dao.TaskTypeFormula:
fc, err := recognitionResult.GetFormulaContent()
if err != nil || fc.Markdown == "" {
log.Error(ctx, "func", "ExportTask", "msg", "公式结果解析失败或markdown为空", "error", err)
return nil, "", errors.New("markdown not found")
}
markdown = fc.Markdown
default:
log.Error(ctx, "func", "ExportTask", "msg", "不支持的导出任务类型", "task_type", recognitionResult.TaskType)
return nil, "", errors.New("unsupported task type for export")
}
// 获取文件名(去掉扩展名)
filename := strings.TrimSuffix(recognitionTask.FileName, "."+strings.ToLower(strings.Split(recognitionTask.FileName, ".")[len(strings.Split(recognitionTask.FileName, "."))-1]))
if filename == "" {
filename = "texpixel"
}
// 构建 JSON 请求体
requestBody := map[string]string{
"markdown": markdown,
"filename": filename,
}
jsonData, err := json.Marshal(requestBody)
if err != nil {
log.Error(ctx, "func", "ExportTask", "msg", "json marshal failed", "error", err)
return nil, "", err
}
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://cloud.texpixel.com:10443/doc_process/v1/convert/file", bytes.NewReader(jsonData))
if err != nil {
log.Error(ctx, "func", "ExportTask", "msg", "create http request failed", "error", err)
return nil, "", err
}
httpReq.Header.Set("Content-Type", "application/json")
client := &http.Client{}
resp, err := client.Do(httpReq)
if err != nil {
log.Error(ctx, "func", "ExportTask", "msg", "http request failed", "error", err)
return nil, "", err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
respBody, _ := io.ReadAll(resp.Body)
log.Error(ctx, "func", "ExportTask", "msg", "export service returned non-200",
"status", resp.StatusCode,
"body", string(respBody),
"markdown_len", len(markdown),
"filename", filename,
)
return nil, "", fmt.Errorf("export service returned status: %d", resp.StatusCode)
}
fileData, err := io.ReadAll(resp.Body)
if err != nil {
log.Error(ctx, "func", "ExportTask", "msg", "read response body failed", "error", err)
return nil, "", err
}
// 新接口只返回 DOCX 格式
contentType := "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
return fileData, contentType, nil
}