refactor: replace pdftoppm with go-fitz for in-process PDF rendering
Switch PDF page rendering from external pdftoppm/pdftocairo subprocess calls to github.com/gen2brain/go-fitz (a MuPDF wrapper), eliminating the poppler-utils runtime dependency. Enable CGO in the Dockerfile builder stage and install gcc/musl-dev so that MuPDF can be statically linked; the runtime image remains unchanged.
This commit is contained in:
@@ -6,14 +6,13 @@ import (
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"image/png"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/gen2brain/go-fitz"
|
||||
|
||||
pdfmodel "gitea.com/texpixel/document_ai/internal/model/pdf"
|
||||
"gitea.com/texpixel/document_ai/internal/storage/cache"
|
||||
"gitea.com/texpixel/document_ai/internal/storage/dao"
|
||||
@@ -221,8 +220,7 @@ func (s *PDFRecognitionService) processPDFTask(ctx context.Context, taskID int64
|
||||
}
|
||||
|
||||
processPages := len(pageImages)
|
||||
log.Info(ctx, "func", "processPDFTask", "msg", "开始处理PDF",
|
||||
"task_id", taskID, "process_pages", processPages)
|
||||
log.Info(ctx, "func", "processPDFTask", "msg", "开始处理PDF", "task_id", taskID, "process_pages", processPages)
|
||||
|
||||
// 逐页 OCR,结果收集
|
||||
var pages []dao.PDFPageContent
|
||||
@@ -236,8 +234,7 @@ func (s *PDFRecognitionService) processPDFTask(ctx context.Context, taskID int64
|
||||
PageNumber: i + 1,
|
||||
Markdown: ocrResult.Markdown,
|
||||
})
|
||||
log.Info(ctx, "func", "processPDFTask", "msg", "页面OCR完成",
|
||||
"page", i+1, "total", processPages)
|
||||
log.Info(ctx, "func", "processPDFTask", "msg", "页面OCR完成", "page", i+1, "total", processPages)
|
||||
}
|
||||
|
||||
// 序列化并写入 DB(单行)
|
||||
@@ -262,47 +259,40 @@ func (s *PDFRecognitionService) processPDFTask(ctx context.Context, taskID int64
|
||||
return nil
|
||||
}
|
||||
|
||||
// renderPDFPages 使用 pdftoppm 将 PDF 渲染为 PNG 字节切片,最多渲染 maxPages 页
|
||||
// renderPDFPages 使用 go-fitz 将 PDF 渲染为 PNG 字节切片,最多渲染 maxPages 页
|
||||
func renderPDFPages(ctx context.Context, pdfBytes []byte, maxPages int) ([][]byte, error) {
|
||||
tmpDir, err := os.MkdirTemp("", "pdf-ocr-*")
|
||||
doc, err := fitz.NewFromMemory(pdfBytes)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("创建临时目录失败: %w", err)
|
||||
return nil, fmt.Errorf("打开PDF失败: %w", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
defer doc.Close()
|
||||
|
||||
pdfPath := filepath.Join(tmpDir, "input.pdf")
|
||||
if err := os.WriteFile(pdfPath, pdfBytes, 0600); err != nil {
|
||||
return nil, fmt.Errorf("写入临时PDF失败: %w", err)
|
||||
total := doc.NumPage()
|
||||
if total == 0 {
|
||||
return nil, fmt.Errorf("PDF不包含任何页面")
|
||||
}
|
||||
if maxPages > 0 && total > maxPages {
|
||||
total = maxPages
|
||||
}
|
||||
|
||||
outPrefix := filepath.Join(tmpDir, "page")
|
||||
cmd := exec.CommandContext(ctx, "pdftoppm",
|
||||
"-r", "150",
|
||||
"-png",
|
||||
"-l", fmt.Sprintf("%d", maxPages),
|
||||
pdfPath,
|
||||
outPrefix,
|
||||
)
|
||||
if out, err := cmd.CombinedOutput(); err != nil {
|
||||
return nil, fmt.Errorf("pdftoppm失败: %w, output: %s", err, string(out))
|
||||
}
|
||||
|
||||
files, err := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("查找渲染输出文件失败: %w", err)
|
||||
}
|
||||
if len(files) == 0 {
|
||||
return nil, fmt.Errorf("pdftoppm未输出任何页面")
|
||||
}
|
||||
sort.Strings(files)
|
||||
|
||||
pages := make([][]byte, 0, len(files))
|
||||
for _, f := range files {
|
||||
data, err := os.ReadFile(f)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("读取页面图片失败: %w", err)
|
||||
pages := make([][]byte, 0, total)
|
||||
for i := 0; i < total; i++ {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
default:
|
||||
}
|
||||
pages = append(pages, data)
|
||||
|
||||
img, err := doc.Image(i)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("渲染第%d页失败: %w", i+1, err)
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
if err := png.Encode(&buf, img); err != nil {
|
||||
return nil, fmt.Errorf("编码第%d页PNG失败: %w", i+1, err)
|
||||
}
|
||||
pages = append(pages, buf.Bytes())
|
||||
}
|
||||
|
||||
return pages, nil
|
||||
|
||||
Reference in New Issue
Block a user