feat: add PDF document recognition with 10-page pre-hook
- Migrate recognition_results table to JSON schema (meta_data + content),
replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
rendering; processing is limited to the first 10 pages (pre-hook)
- Reuse the existing downstream OCR endpoint (cloud.texpixel.com) for each
page image; results are stored as a [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -89,17 +89,17 @@ func (svc *TaskService) GetTaskList(ctx context.Context, req *task.TaskListReque
|
||||
Total: total,
|
||||
}
|
||||
for _, item := range tasks {
|
||||
var latex string
|
||||
var markdown string
|
||||
var mathML string
|
||||
var mml string
|
||||
var latex, markdown, mathML, mml string
|
||||
recognitionResult := recognitionResultMap[item.ID]
|
||||
if recognitionResult != nil {
|
||||
latex = recognitionResult.Latex
|
||||
markdown = recognitionResult.Markdown
|
||||
mathML = recognitionResult.MathML
|
||||
mml = recognitionResult.MML
|
||||
if recognitionResult != nil && recognitionResult.TaskType == dao.TaskTypeFormula {
|
||||
if fc, err := recognitionResult.GetFormulaContent(); err == nil {
|
||||
latex = fc.Latex
|
||||
markdown = fc.Markdown
|
||||
mathML = fc.MathML
|
||||
mml = fc.MML
|
||||
}
|
||||
}
|
||||
// PDF 类型的 TaskListDTO 暂不展开 content(列表页只显示状态)
|
||||
originURL, err := oss.GetDownloadURL(ctx, item.FileURL)
|
||||
if err != nil {
|
||||
log.Error(ctx, "func", "GetTaskList", "msg", "get origin url failed", "error", err)
|
||||
@@ -148,10 +148,18 @@ func (svc *TaskService) ExportTask(ctx context.Context, req *task.ExportTaskRequ
|
||||
return nil, "", errors.New("recognition result not found")
|
||||
}
|
||||
|
||||
markdown := recognitionResult.Markdown
|
||||
if markdown == "" {
|
||||
log.Error(ctx, "func", "ExportTask", "msg", "markdown not found")
|
||||
return nil, "", errors.New("markdown not found")
|
||||
var markdown string
|
||||
switch recognitionResult.TaskType {
|
||||
case dao.TaskTypeFormula:
|
||||
fc, err := recognitionResult.GetFormulaContent()
|
||||
if err != nil || fc.Markdown == "" {
|
||||
log.Error(ctx, "func", "ExportTask", "msg", "公式结果解析失败或markdown为空", "error", err)
|
||||
return nil, "", errors.New("markdown not found")
|
||||
}
|
||||
markdown = fc.Markdown
|
||||
default:
|
||||
log.Error(ctx, "func", "ExportTask", "msg", "不支持的导出任务类型", "task_type", recognitionResult.TaskType)
|
||||
return nil, "", errors.New("unsupported task type for export")
|
||||
}
|
||||
|
||||
// 获取文件名(去掉扩展名)
|
||||
|
||||
Reference in New Issue
Block a user