feat: add PDF document recognition with 10-page pre-hook
- Migrate recognition_results table to JSON schema (meta_data + content),
replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
rendering; limits processing to first 10 pages (pre-hook)
- Reuse existing downstream OCR endpoint (cloud.texpixel.com) for each
page image; stores results as [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -70,7 +70,7 @@ func (h *OSSEndpoint) GetSignatureURL(ctx *gin.Context) {
|
||||
ctx.JSON(http.StatusOK, common.ErrorResponse(ctx, common.CodeParamError, "invalid file name"))
|
||||
return
|
||||
}
|
||||
if !utils.InArray(extend, []string{".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"}) {
|
||||
if !utils.InArray(extend, []string{".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp", ".pdf"}) {
|
||||
ctx.JSON(http.StatusOK, common.ErrorResponse(ctx, common.CodeParamError, "invalid file type"))
|
||||
return
|
||||
}
|
||||
|
||||
95
api/v1/pdf/handler.go
Normal file
95
api/v1/pdf/handler.go
Normal file
@@ -0,0 +1,95 @@
|
||||
package pdf
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
pdfmodel "gitea.com/texpixel/document_ai/internal/model/pdf"
|
||||
"gitea.com/texpixel/document_ai/internal/service"
|
||||
"gitea.com/texpixel/document_ai/pkg/common"
|
||||
"gitea.com/texpixel/document_ai/pkg/constant"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
)
|
||||
|
||||
type PDFEndpoint struct {
|
||||
pdfService *service.PDFRecognitionService
|
||||
}
|
||||
|
||||
func NewPDFEndpoint() *PDFEndpoint {
|
||||
return &PDFEndpoint{
|
||||
pdfService: service.NewPDFRecognitionService(),
|
||||
}
|
||||
}
|
||||
|
||||
// CreateTask godoc
|
||||
// @Summary Create a PDF recognition task
|
||||
// @Description Create a new PDF recognition task (max 10 pages processed)
|
||||
// @Tags PDF
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param request body pdfmodel.CreatePDFRecognitionRequest true "Create PDF task request"
|
||||
// @Success 200 {object} common.Response{data=pdfmodel.CreatePDFTaskResponse}
|
||||
// @Failure 400 {object} common.Response
|
||||
// @Failure 500 {object} common.Response
|
||||
// @Router /v1/pdf/recognition [post]
|
||||
func (e *PDFEndpoint) CreateTask(c *gin.Context) {
|
||||
var req pdfmodel.CreatePDFRecognitionRequest
|
||||
if err := c.BindJSON(&req); err != nil {
|
||||
c.JSON(http.StatusOK, common.ErrorResponse(c, common.CodeParamError, "参数错误"))
|
||||
return
|
||||
}
|
||||
req.UserID = c.GetInt64(constant.ContextUserID)
|
||||
|
||||
if strings.ToLower(filepath.Ext(req.FileName)) != ".pdf" {
|
||||
c.JSON(http.StatusOK, common.ErrorResponse(c, common.CodeParamError, "仅支持PDF文件"))
|
||||
return
|
||||
}
|
||||
|
||||
task, err := e.pdfService.CreatePDFTask(c, &req)
|
||||
if err != nil {
|
||||
if bizErr, ok := err.(*common.BusinessError); ok {
|
||||
c.JSON(http.StatusOK, common.ErrorResponse(c, int(bizErr.Code), bizErr.Message))
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusOK, common.ErrorResponse(c, common.CodeSystemError, "创建任务失败"))
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, common.SuccessResponse(c, &pdfmodel.CreatePDFTaskResponse{
|
||||
TaskNo: task.TaskUUID,
|
||||
Status: int(task.Status),
|
||||
}))
|
||||
}
|
||||
|
||||
// GetTaskStatus godoc
|
||||
// @Summary Get PDF recognition task status and results
|
||||
// @Description Poll task status; pages field populated when status=2 (completed)
|
||||
// @Tags PDF
|
||||
// @Accept json
|
||||
// @Produce json
|
||||
// @Param task_no path string true "Task No"
|
||||
// @Success 200 {object} common.Response{data=pdfmodel.GetPDFTaskResponse}
|
||||
// @Failure 404 {object} common.Response
|
||||
// @Failure 500 {object} common.Response
|
||||
// @Router /v1/pdf/recognition/{task_no} [get]
|
||||
func (e *PDFEndpoint) GetTaskStatus(c *gin.Context) {
|
||||
var req pdfmodel.GetPDFTaskRequest
|
||||
if err := c.ShouldBindUri(&req); err != nil {
|
||||
c.JSON(http.StatusOK, common.ErrorResponse(c, common.CodeParamError, "参数错误"))
|
||||
return
|
||||
}
|
||||
|
||||
resp, err := e.pdfService.GetPDFTask(c, req.TaskNo)
|
||||
if err != nil {
|
||||
if bizErr, ok := err.(*common.BusinessError); ok {
|
||||
c.JSON(http.StatusOK, common.ErrorResponse(c, int(bizErr.Code), bizErr.Message))
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusOK, common.ErrorResponse(c, common.CodeSystemError, "查询任务失败"))
|
||||
return
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, common.SuccessResponse(c, resp))
|
||||
}
|
||||
Reference in New Issue
Block a user