-- Source: doc_ai_backed/migrations/pdf_recognition.sql (MySQL; 33 lines, 1.2 KiB)

-- migrations/pdf_recognition.sql
-- Restructures the recognition_results table to a JSON-based content schema.
-- Execution order: add the new columns -> backfill historical data -> drop the old columns.
-- Step 1: add the new JSON columns. The legacy columns are kept for now and
-- are only dropped after their data has been migrated (Step 4).
ALTER TABLE `recognition_results`
    ADD COLUMN `meta_data` JSON DEFAULT NULL
        COMMENT '元数据 {"total_num":1}'
        AFTER `task_type`,
    ADD COLUMN `content` JSON DEFAULT NULL
        COMMENT '识别内容 JSON'
        AFTER `meta_data`;
-- Step 2: backfill the new JSON columns from the legacy columns.
-- Every existing row is assumed to be a single-page FORMULA result, so
-- meta_data.total_num is set to 1.
-- content shape: {"latex":"...","markdown":"...","mathml":"...","mml":"..."}
-- A NULL legacy value is stored as an empty string. Only rows whose content
-- is still NULL are updated, so rows that were already backfilled are skipped.
UPDATE `recognition_results`
SET
    `meta_data` = JSON_OBJECT('total_num', 1),
    `content`   = JSON_OBJECT(
        'latex',    COALESCE(`latex`, ''),
        'markdown', COALESCE(`markdown`, ''),
        'mathml',   COALESCE(`mathml`, ''),
        'mml',      COALESCE(`mml`, '')
    )
WHERE `content` IS NULL;
-- Step 3: verify that the backfill is complete (the query should return 0).
-- SELECT COUNT(*) FROM `recognition_results` WHERE `content` IS NULL;
-- Step 4: drop the legacy columns. Their values now live in `content`;
-- dropping the columns discards the originals, so run the Step 3 check first.
ALTER TABLE `recognition_results`
    DROP COLUMN `latex`,
    DROP COLUMN `markdown`,
    DROP COLUMN `mathml`,
    DROP COLUMN `mml`;