Files
doc_processer/app/services/image_processor.py
2026-01-01 23:38:52 +08:00

171 lines
5.5 KiB
Python

"""Image preprocessing service using OpenCV."""
import base64
import io
from urllib.request import urlopen
import cv2
import numpy as np
from PIL import Image
from app.core.config import get_settings
# Settings are looked up once, when this module is imported; ImageProcessor
# reads its default padding ratio from this object.
settings = get_settings()
class ImageProcessor:
    """Service for image preprocessing operations.

    Loads images from a URL or a base64 string, normalises them to
    3-channel BGR numpy arrays (the layout OpenCV works with), and offers
    helpers for adding a white border and re-encoding to base64.
    """

    def __init__(self, padding_ratio: float | None = None):
        """Initialize with padding ratio.

        Args:
            padding_ratio: Ratio for padding on each side. ``None`` (the
                default) falls back to ``settings.image_padding_ratio``.
                0.15 means 15% padding on each side = 30% total expansion.
                An explicit ``0.0`` is honoured and disables padding.
        """
        # Compare against None rather than relying on truthiness, so that an
        # explicit 0.0 is not silently replaced by the configured default.
        self.padding_ratio = (
            settings.image_padding_ratio if padding_ratio is None else padding_ratio
        )

    def _convert_to_bgr(self, pil_image: Image.Image) -> np.ndarray:
        """Convert PIL Image to BGR numpy array, handling alpha channel.

        Transparent pixels are flattened onto a white background so they do
        not show up as arbitrary colours after the alpha channel is dropped.

        Args:
            pil_image: PIL Image object.

        Returns:
            Image as numpy array in BGR format.
        """
        if pil_image.mode in ("RGBA", "LA", "P"):
            # RGBA, grayscale+alpha and palette images (which may carry
            # transparency) all go through the same path: promote to RGBA,
            # then composite over white using the alpha channel as the mask.
            rgba = pil_image.convert("RGBA")
            flattened = Image.new("RGB", rgba.size, (255, 255, 255))
            flattened.paste(rgba, mask=rgba.getchannel("A"))
            pil_image = flattened
        elif pil_image.mode != "RGB":
            # Any other mode (L, CMYK, I;16, ...) is converted directly.
            pil_image = pil_image.convert("RGB")

        return cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

    def load_image_from_url(self, url: str) -> np.ndarray:
        """Load image from URL.

        Only ``http`` and ``https`` URLs are accepted. ``urlopen`` would
        otherwise also follow ``file:``, ``ftp:`` and ``data:`` URLs, which
        must not be reachable through a caller-supplied value.

        Args:
            url: Image URL to fetch.

        Returns:
            Image as numpy array in BGR format.

        Raises:
            ValueError: If the URL scheme is not http/https or the image
                cannot be loaded from the URL.
        """
        if not isinstance(url, str) or not url.strip().lower().startswith(
            ("http://", "https://")
        ):
            raise ValueError(
                "Failed to load image from URL: unsupported URL scheme "
                "(only http and https are allowed)"
            )

        try:
            with urlopen(url, timeout=30) as response:
                image_data = response.read()
            pil_image = Image.open(io.BytesIO(image_data))
            return self._convert_to_bgr(pil_image)
        except Exception as e:
            # Single error type for callers, whatever failed underneath
            # (network, HTTP status, undecodable image data).
            raise ValueError(f"Failed to load image from URL: {e}") from e

    def load_image_from_base64(self, base64_str: str) -> np.ndarray:
        """Load image from base64 string.

        Args:
            base64_str: Base64-encoded image data, either raw or in data URL
                form (``data:image/png;base64,<payload>``).

        Returns:
            Image as numpy array in BGR format.

        Raises:
            ValueError: If image cannot be decoded.
        """
        try:
            # Handle data URL format: keep only what follows the first comma.
            _, separator, payload = base64_str.partition(",")
            if separator:
                base64_str = payload
            image_data = base64.b64decode(base64_str)
            pil_image = Image.open(io.BytesIO(image_data))
            return self._convert_to_bgr(pil_image)
        except Exception as e:
            raise ValueError(f"Failed to decode base64 image: {e}") from e

    def add_padding(self, image: np.ndarray) -> np.ndarray:
        """Add whitespace padding around the image.

        Adds ``int(padding_ratio * max(height, width))`` white pixels on each
        of the four sides. With a ratio of 0.15 the longer side grows by
        approximately 30% in total (15% on each side).

        Args:
            image: Input image as numpy array in BGR format.

        Returns:
            Padded image as numpy array.
        """
        height, width = image.shape[:2]
        # The same border width is used on all sides, derived from the longer
        # edge.
        padding = int(max(height, width) * self.padding_ratio)

        return cv2.copyMakeBorder(
            image,
            top=padding,
            bottom=padding,
            left=padding,
            right=padding,
            borderType=cv2.BORDER_CONSTANT,
            value=[255, 255, 255],  # White
        )

    def preprocess(self, image_url: str | None, image_base64: str | None) -> np.ndarray:
        """Load an image from whichever input is provided.

        ``image_url`` takes precedence when both are given. The image is
        returned as loaded: no padding is applied here, so callers that need
        a border must call ``add_padding`` themselves.

        Args:
            image_url: URL to fetch image from (optional).
            image_base64: Base64-encoded image (optional).

        Returns:
            Loaded image as numpy array in BGR format.

        Raises:
            ValueError: If neither input is provided or loading fails.
        """
        if image_url:
            return self.load_image_from_url(image_url)
        if image_base64:
            return self.load_image_from_base64(image_base64)
        raise ValueError("Either image_url or image_base64 must be provided")

    def image_to_base64(self, image: np.ndarray, format: str = "PNG") -> str:
        """Convert numpy image to base64 string.

        Args:
            image: Image as numpy array in BGR format.
            format: Output format (PNG, JPEG).

        Returns:
            Base64-encoded image string.
        """
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        buffer = io.BytesIO()
        Image.fromarray(image_rgb).save(buffer, format=format)
        # getvalue() returns the whole buffer regardless of stream position.
        return base64.b64encode(buffer.getvalue()).decode("utf-8")