- Migrate recognition_results table to JSON schema (meta_data + content),
replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
rendering; limits processing to first 10 pages (pre-hook)
- Reuse existing downstream OCR endpoint (cloud.texpixel.com) for each
page image; stores results as [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
219 lines
7.0 KiB
Go
219 lines
7.0 KiB
Go
package service
|
||
|
||
import (
|
||
"bytes"
|
||
"context"
|
||
"encoding/json"
|
||
"errors"
|
||
"fmt"
|
||
"io"
|
||
"net/http"
|
||
"strings"
|
||
|
||
"gitea.com/texpixel/document_ai/internal/model/task"
|
||
"gitea.com/texpixel/document_ai/internal/storage/dao"
|
||
"gitea.com/texpixel/document_ai/pkg/log"
|
||
"gitea.com/texpixel/document_ai/pkg/oss"
|
||
)
|
||
|
||
type TaskService struct {
|
||
recognitionTaskDao *dao.RecognitionTaskDao
|
||
evaluateTaskDao *dao.EvaluateTaskDao
|
||
recognitionResultDao *dao.RecognitionResultDao
|
||
}
|
||
|
||
func NewTaskService() *TaskService {
|
||
return &TaskService{
|
||
recognitionTaskDao: dao.NewRecognitionTaskDao(),
|
||
evaluateTaskDao: dao.NewEvaluateTaskDao(),
|
||
recognitionResultDao: dao.NewRecognitionResultDao(),
|
||
}
|
||
}
|
||
|
||
func (svc *TaskService) EvaluateTask(ctx context.Context, req *task.EvaluateTaskRequest) error {
|
||
task, err := svc.recognitionTaskDao.GetByTaskNo(dao.DB.WithContext(ctx), req.TaskNo)
|
||
if err != nil {
|
||
log.Error(ctx, "func", "EvaluateTask", "msg", "get task by task no failed", "error", err)
|
||
return err
|
||
}
|
||
|
||
if task == nil {
|
||
log.Error(ctx, "func", "EvaluateTask", "msg", "task not found")
|
||
return errors.New("task not found")
|
||
}
|
||
if task.Status != dao.TaskStatusCompleted {
|
||
log.Error(ctx, "func", "EvaluateTask", "msg", "task not finished")
|
||
return errors.New("task not finished")
|
||
}
|
||
|
||
evaluateTask := &dao.EvaluateTask{
|
||
TaskID: task.ID,
|
||
Satisfied: req.Satisfied,
|
||
Feedback: req.Feedback,
|
||
Comment: strings.Join(req.Suggestion, ","),
|
||
}
|
||
err = svc.evaluateTaskDao.Create(dao.DB.WithContext(ctx), evaluateTask)
|
||
if err != nil {
|
||
log.Error(ctx, "func", "EvaluateTask", "msg", "create evaluate task failed", "error", err)
|
||
return err
|
||
}
|
||
|
||
return nil
|
||
}
|
||
|
||
func (svc *TaskService) GetTaskList(ctx context.Context, req *task.TaskListRequest) (*task.TaskListResponse, error) {
|
||
tasks, total, err := svc.recognitionTaskDao.GetTaskList(dao.DB.WithContext(ctx), req.UserID, dao.TaskType(req.TaskType), req.Page, req.PageSize)
|
||
if err != nil {
|
||
log.Error(ctx, "func", "GetTaskList", "msg", "get task list failed", "error", err)
|
||
return nil, err
|
||
}
|
||
|
||
taskIDs := make([]int64, 0, len(tasks))
|
||
for _, item := range tasks {
|
||
taskIDs = append(taskIDs, item.ID)
|
||
}
|
||
|
||
recognitionResults, err := svc.recognitionResultDao.GetByTaskIDs(dao.DB.WithContext(ctx), taskIDs)
|
||
if err != nil {
|
||
log.Error(ctx, "func", "GetTaskList", "msg", "get recognition results failed", "error", err)
|
||
return nil, err
|
||
}
|
||
|
||
recognitionResultMap := make(map[int64]*dao.RecognitionResult)
|
||
for _, item := range recognitionResults {
|
||
recognitionResultMap[item.TaskID] = item
|
||
}
|
||
|
||
resp := &task.TaskListResponse{
|
||
TaskList: make([]*task.TaskListDTO, 0, len(tasks)),
|
||
Total: total,
|
||
}
|
||
for _, item := range tasks {
|
||
var latex, markdown, mathML, mml string
|
||
recognitionResult := recognitionResultMap[item.ID]
|
||
if recognitionResult != nil && recognitionResult.TaskType == dao.TaskTypeFormula {
|
||
if fc, err := recognitionResult.GetFormulaContent(); err == nil {
|
||
latex = fc.Latex
|
||
markdown = fc.Markdown
|
||
mathML = fc.MathML
|
||
mml = fc.MML
|
||
}
|
||
}
|
||
// PDF 类型的 TaskListDTO 暂不展开 content(列表页只显示状态)
|
||
originURL, err := oss.GetDownloadURL(ctx, item.FileURL)
|
||
if err != nil {
|
||
log.Error(ctx, "func", "GetTaskList", "msg", "get origin url failed", "error", err)
|
||
}
|
||
resp.TaskList = append(resp.TaskList, &task.TaskListDTO{
|
||
Latex: latex,
|
||
Markdown: markdown,
|
||
MathML: mathML,
|
||
MML: mml,
|
||
TaskID: item.TaskUUID,
|
||
FileName: item.FileName,
|
||
Status: int(item.Status),
|
||
OriginURL: originURL,
|
||
TaskType: item.TaskType.String(),
|
||
CreatedAt: item.CreatedAt.Format("2006-01-02 15:04:05"),
|
||
})
|
||
}
|
||
return resp, nil
|
||
}
|
||
|
||
func (svc *TaskService) ExportTask(ctx context.Context, req *task.ExportTaskRequest) ([]byte, string, error) {
|
||
recognitionTask, err := svc.recognitionTaskDao.GetByTaskNo(dao.DB.WithContext(ctx), req.TaskNo)
|
||
if err != nil {
|
||
log.Error(ctx, "func", "ExportTask", "msg", "get task by task id failed", "error", err)
|
||
return nil, "", err
|
||
}
|
||
|
||
if recognitionTask == nil {
|
||
log.Error(ctx, "func", "ExportTask", "msg", "task not found")
|
||
return nil, "", errors.New("task not found")
|
||
}
|
||
|
||
if recognitionTask.Status != dao.TaskStatusCompleted {
|
||
log.Error(ctx, "func", "ExportTask", "msg", "task not finished")
|
||
return nil, "", errors.New("task not finished")
|
||
}
|
||
|
||
recognitionResult, err := svc.recognitionResultDao.GetByTaskID(dao.DB.WithContext(ctx), recognitionTask.ID)
|
||
if err != nil {
|
||
log.Error(ctx, "func", "ExportTask", "msg", "get recognition result by task id failed", "error", err)
|
||
return nil, "", err
|
||
}
|
||
|
||
if recognitionResult == nil {
|
||
log.Error(ctx, "func", "ExportTask", "msg", "recognition result not found")
|
||
return nil, "", errors.New("recognition result not found")
|
||
}
|
||
|
||
var markdown string
|
||
switch recognitionResult.TaskType {
|
||
case dao.TaskTypeFormula:
|
||
fc, err := recognitionResult.GetFormulaContent()
|
||
if err != nil || fc.Markdown == "" {
|
||
log.Error(ctx, "func", "ExportTask", "msg", "公式结果解析失败或markdown为空", "error", err)
|
||
return nil, "", errors.New("markdown not found")
|
||
}
|
||
markdown = fc.Markdown
|
||
default:
|
||
log.Error(ctx, "func", "ExportTask", "msg", "不支持的导出任务类型", "task_type", recognitionResult.TaskType)
|
||
return nil, "", errors.New("unsupported task type for export")
|
||
}
|
||
|
||
// 获取文件名(去掉扩展名)
|
||
filename := strings.TrimSuffix(recognitionTask.FileName, "."+strings.ToLower(strings.Split(recognitionTask.FileName, ".")[len(strings.Split(recognitionTask.FileName, "."))-1]))
|
||
if filename == "" {
|
||
filename = "texpixel"
|
||
}
|
||
|
||
// 构建 JSON 请求体
|
||
requestBody := map[string]string{
|
||
"markdown": markdown,
|
||
"filename": filename,
|
||
}
|
||
jsonData, err := json.Marshal(requestBody)
|
||
if err != nil {
|
||
log.Error(ctx, "func", "ExportTask", "msg", "json marshal failed", "error", err)
|
||
return nil, "", err
|
||
}
|
||
|
||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://cloud.texpixel.com:10443/doc_process/v1/convert/file", bytes.NewReader(jsonData))
|
||
if err != nil {
|
||
log.Error(ctx, "func", "ExportTask", "msg", "create http request failed", "error", err)
|
||
return nil, "", err
|
||
}
|
||
httpReq.Header.Set("Content-Type", "application/json")
|
||
|
||
client := &http.Client{}
|
||
resp, err := client.Do(httpReq)
|
||
if err != nil {
|
||
log.Error(ctx, "func", "ExportTask", "msg", "http request failed", "error", err)
|
||
return nil, "", err
|
||
}
|
||
defer resp.Body.Close()
|
||
|
||
if resp.StatusCode != http.StatusOK {
|
||
respBody, _ := io.ReadAll(resp.Body)
|
||
log.Error(ctx, "func", "ExportTask", "msg", "export service returned non-200",
|
||
"status", resp.StatusCode,
|
||
"body", string(respBody),
|
||
"markdown_len", len(markdown),
|
||
"filename", filename,
|
||
)
|
||
return nil, "", fmt.Errorf("export service returned status: %d", resp.StatusCode)
|
||
}
|
||
|
||
fileData, err := io.ReadAll(resp.Body)
|
||
if err != nil {
|
||
log.Error(ctx, "func", "ExportTask", "msg", "read response body failed", "error", err)
|
||
return nil, "", err
|
||
}
|
||
|
||
// 新接口只返回 DOCX 格式
|
||
contentType := "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||
|
||
return fileData, contentType, nil
|
||
}
|