refactor: replace pdftoppm with go-fitz for in-process PDF rendering

Switch PDF page rendering from external pdftoppm/pdftocairo subprocess calls
to github.com/gen2brain/go-fitz (a MuPDF wrapper), eliminating the poppler-utils
runtime dependency. Enable CGO in the Dockerfile builder stage and install
gcc/musl-dev so that MuPDF can be linked statically; the runtime image remains unchanged.
This commit is contained in:
2026-03-31 21:21:17 +08:00
parent 86dacb61a6
commit 84ce6f6b92
4 changed files with 48 additions and 90 deletions

View File

@@ -6,14 +6,13 @@ import (
"encoding/base64"
"encoding/json"
"fmt"
"image/png"
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"sort"
"time"
"github.com/gen2brain/go-fitz"
pdfmodel "gitea.com/texpixel/document_ai/internal/model/pdf"
"gitea.com/texpixel/document_ai/internal/storage/cache"
"gitea.com/texpixel/document_ai/internal/storage/dao"
@@ -221,8 +220,7 @@ func (s *PDFRecognitionService) processPDFTask(ctx context.Context, taskID int64
}
processPages := len(pageImages)
log.Info(ctx, "func", "processPDFTask", "msg", "开始处理PDF",
"task_id", taskID, "process_pages", processPages)
log.Info(ctx, "func", "processPDFTask", "msg", "开始处理PDF", "task_id", taskID, "process_pages", processPages)
// 逐页 OCR结果收集
var pages []dao.PDFPageContent
@@ -236,8 +234,7 @@ func (s *PDFRecognitionService) processPDFTask(ctx context.Context, taskID int64
PageNumber: i + 1,
Markdown: ocrResult.Markdown,
})
log.Info(ctx, "func", "processPDFTask", "msg", "页面OCR完成",
"page", i+1, "total", processPages)
log.Info(ctx, "func", "processPDFTask", "msg", "页面OCR完成", "page", i+1, "total", processPages)
}
// 序列化并写入 DB单行
@@ -262,47 +259,40 @@ func (s *PDFRecognitionService) processPDFTask(ctx context.Context, taskID int64
return nil
}
// renderPDFPages 使用 pdftoppm 将 PDF 渲染为 PNG 字节切片,最多渲染 maxPages 页
// renderPDFPages 使用 go-fitz 将 PDF 渲染为 PNG 字节切片,最多渲染 maxPages 页
func renderPDFPages(ctx context.Context, pdfBytes []byte, maxPages int) ([][]byte, error) {
tmpDir, err := os.MkdirTemp("", "pdf-ocr-*")
doc, err := fitz.NewFromMemory(pdfBytes)
if err != nil {
return nil, fmt.Errorf("创建临时目录失败: %w", err)
return nil, fmt.Errorf("打开PDF失败: %w", err)
}
defer os.RemoveAll(tmpDir)
defer doc.Close()
pdfPath := filepath.Join(tmpDir, "input.pdf")
if err := os.WriteFile(pdfPath, pdfBytes, 0600); err != nil {
return nil, fmt.Errorf("写入临时PDF失败: %w", err)
total := doc.NumPage()
if total == 0 {
return nil, fmt.Errorf("PDF不包含任何页面")
}
if maxPages > 0 && total > maxPages {
total = maxPages
}
outPrefix := filepath.Join(tmpDir, "page")
cmd := exec.CommandContext(ctx, "pdftoppm",
"-r", "150",
"-png",
"-l", fmt.Sprintf("%d", maxPages),
pdfPath,
outPrefix,
)
if out, err := cmd.CombinedOutput(); err != nil {
return nil, fmt.Errorf("pdftoppm失败: %w, output: %s", err, string(out))
}
files, err := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
if err != nil {
return nil, fmt.Errorf("查找渲染输出文件失败: %w", err)
}
if len(files) == 0 {
return nil, fmt.Errorf("pdftoppm未输出任何页面")
}
sort.Strings(files)
pages := make([][]byte, 0, len(files))
for _, f := range files {
data, err := os.ReadFile(f)
if err != nil {
return nil, fmt.Errorf("读取页面图片失败: %w", err)
pages := make([][]byte, 0, total)
for i := 0; i < total; i++ {
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
pages = append(pages, data)
img, err := doc.Image(i)
if err != nil {
return nil, fmt.Errorf("渲染第%d页失败: %w", i+1, err)
}
var buf bytes.Buffer
if err := png.Encode(&buf, img); err != nil {
return nil, fmt.Errorf("编码第%d页PNG失败: %w", i+1, err)
}
pages = append(pages, buf.Bytes())
}
return pages, nil