feat: add PDF document recognition with 10-page pre-hook
- Migrate recognition_results table to JSON schema (meta_data + content),
replacing flat latex/markdown/mathml/mml columns
- Add TaskTypePDF constant and update all formula read/write paths
- Add PDFRecognitionService using pdftoppm (Poppler) for CGO-free page
rendering; limits processing to first 10 pages (pre-hook)
- Reuse existing downstream OCR endpoint (cloud.texpixel.com) for each
page image; stores results as [{page_number, markdown}] JSON array
- Add Redis queue + distributed lock for PDF worker goroutine
- Add REST endpoints: POST /v1/pdf/recognition, GET /v1/pdf/recognition/:task_no
- Add .pdf to OSS upload file type whitelist
- Add migrations/pdf_recognition.sql for safe data migration
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
32
migrations/pdf_recognition.sql
Normal file
32
migrations/pdf_recognition.sql
Normal file
@@ -0,0 +1,32 @@
|
||||
-- migrations/pdf_recognition.sql
|
||||
-- 将 recognition_results 表重构为 JSON content schema
|
||||
-- 执行顺序:加新列 → 洗历史数据 → 删旧列
|
||||
|
||||
-- Step 1: add the new JSON columns. The legacy flat columns are kept for now
-- and are only dropped after their data has been migrated (see Step 4).
--   meta_data : per-record metadata, e.g. {"total_num": 1}
--   content   : recognition output stored as JSON
ALTER TABLE `recognition_results`
    ADD COLUMN `meta_data` JSON DEFAULT NULL
        COMMENT '元数据 {"total_num":1}'
        AFTER `task_type`,
    ADD COLUMN `content` JSON DEFAULT NULL
        COMMENT '识别内容 JSON'
        AFTER `meta_data`;
|
||||
|
||||
-- Step 2: backfill the new JSON columns from the legacy flat columns.
-- Per the migration notes, every existing record is a single-page FORMULA
-- result, so meta_data.total_num is set to 1 for all of them.
-- Resulting content shape:
--   {"latex": "...", "markdown": "...", "mathml": "...", "mml": "..."}
-- A NULL legacy value is stored as an empty string. Only rows whose content
-- is still NULL are touched, so rows that were already backfilled are skipped.
UPDATE `recognition_results`
SET
    `meta_data` = JSON_OBJECT('total_num', 1),
    `content`   = JSON_OBJECT(
        'latex',    COALESCE(`latex`, ''),
        'markdown', COALESCE(`markdown`, ''),
        'mathml',   COALESCE(`mathml`, ''),
        'mml',      COALESCE(`mml`, '')
    )
WHERE `content` IS NULL;
|
||||
|
||||
-- Step 3: 验证数据洗涤完成(应返回 0)
|
||||
-- SELECT COUNT(*) FROM `recognition_results` WHERE `content` IS NULL;
|
||||
|
||||
-- Step 4: drop the legacy flat columns now that their data lives in `content`.
-- This is destructive: run it only after the Step 3 verification query
-- reports 0 rows with a NULL `content`.
ALTER TABLE `recognition_results`
    DROP COLUMN `latex`,
    DROP COLUMN `markdown`,
    DROP COLUMN `mathml`,
    DROP COLUMN `mml`;
|
||||
Reference in New Issue
Block a user