feat: add PDF document recognition with 10-page pre-hook
- Migrate recognition_results table to JSON schema (meta_data + content),
replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService, which renders pages with pdftoppm (Poppler) so that
no CGO is required; a pre-hook limits processing to the first 10 pages
- Reuse the existing downstream OCR endpoint (cloud.texpixel.com) for each
page image; store the results as a JSON array of {page_number, markdown} objects
- Add a Redis queue and a distributed lock for the PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
||||
"gitea.com/texpixel/document_ai/api/v1/analytics"
|
||||
"gitea.com/texpixel/document_ai/api/v1/formula"
|
||||
"gitea.com/texpixel/document_ai/api/v1/oss"
|
||||
"gitea.com/texpixel/document_ai/api/v1/pdf"
|
||||
"gitea.com/texpixel/document_ai/api/v1/task"
|
||||
"gitea.com/texpixel/document_ai/api/v1/user"
|
||||
"gitea.com/texpixel/document_ai/pkg/common"
|
||||
@@ -55,6 +56,13 @@ func SetupRouter(engine *gin.RouterGroup) {
|
||||
userAuthRouter.GET("/info", common.MustAuthMiddleware(), userEndpoint.GetUserInfo)
|
||||
}
|
||||
|
||||
pdfRouter := v1.Group("/pdf", common.GetAuthMiddleware())
|
||||
{
|
||||
endpoint := pdf.NewPDFEndpoint()
|
||||
pdfRouter.POST("/recognition", endpoint.CreateTask)
|
||||
pdfRouter.GET("/recognition/:task_no", endpoint.GetTaskStatus)
|
||||
}
|
||||
|
||||
// 数据埋点路由
|
||||
analyticsRouter := v1.Group("/analytics", common.GetAuthMiddleware())
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user