Files
CyberStrikeAI/internal/vision/preprocess.go
T
2026-06-03 17:16:48 +08:00

213 lines
5.4 KiB
Go

package vision
import (
"bytes"
"fmt"
"image"
"os"
"strings"
"github.com/disintegration/imaging"
)
// ImagePayload holds the image bytes and MIME type that are sent to the VL API.
type ImagePayload struct {
// Bytes is the encoded image content.
Bytes []byte
// MIMEType is the MIME type describing Bytes, e.g. "image/jpeg".
MIMEType string
}
// PreprocessMeta records the scaling and encoding results, for tool output
// and troubleshooting.
type PreprocessMeta struct {
// OriginalPath is the path of the source image file.
OriginalPath string
// OriginalBytes is the size of the source file in bytes.
OriginalBytes int64
// OriginalWidth and OriginalHeight are the source image dimensions in pixels.
OriginalWidth int
OriginalHeight int
// OutputWidth and OutputHeight are the dimensions of the image in the payload.
OutputWidth int
OutputHeight int
// OutputBytes is the length of the payload in bytes.
OutputBytes int
// OutputMIMEType is the MIME type of the payload.
OutputMIMEType string
JPEGQuality int // 0 means the image was not re-encoded as JPEG (original passed through)
PreprocessMode string // passthrough | jpeg
}
// PreprocessOptions holds the image preprocessing parameters.
type PreprocessOptions struct {
// MaxImageBytes rejects source files larger than this many bytes;
// values <= 0 disable the check.
MaxImageBytes int64
// MaxDimension is the longest allowed edge in pixels; values <= 0 fall back
// to 2048.
MaxDimension int
// JPEGQuality is the starting JPEG quality; values outside 1..100 fall back
// to 82.
JPEGQuality int
// MaxPayloadBytes caps the size of the payload in bytes; values <= 0 fall
// back to 512 KiB.
MaxPayloadBytes int64
SkipPreprocessBelowBytes int64 // 0 = always compress; >0 lets small images with compliant dimensions pass through unmodified
}
// PreprocessImageFile loads the image at path and prepares it for upload.
//
// The file is rejected when opt.MaxImageBytes is positive and the file is
// larger than that. Otherwise the image header is decoded to learn its
// dimensions and format, and the file is either passed through unmodified
// (see tryPassthrough) or scaled and re-encoded as JPEG (see
// compressWithImaging).
//
// opt.MaxDimension defaults to 2048 and opt.MaxPayloadBytes to 512 KiB when
// they are not positive.
//
// If the passthrough attempt does not succeed, including when it fails with a
// read error, that error is not surfaced and the compress path is tried.
func PreprocessImageFile(path string, opt PreprocessOptions) (ImagePayload, PreprocessMeta, error) {
	meta := PreprocessMeta{OriginalPath: path}

	info, err := os.Stat(path)
	if err != nil {
		return ImagePayload{}, meta, err
	}
	fileSize := info.Size()
	meta.OriginalBytes = fileSize
	if limit := opt.MaxImageBytes; limit > 0 && fileSize > limit {
		return ImagePayload{}, meta, fmt.Errorf("file size %d exceeds max_image_bytes %d", fileSize, limit)
	}

	width, height, format, err := imageDimensions(path)
	if err != nil {
		return ImagePayload{}, meta, err
	}
	meta.OriginalWidth, meta.OriginalHeight = width, height

	dimLimit := opt.MaxDimension
	if dimLimit <= 0 {
		dimLimit = 2048
	}
	payloadLimit := opt.MaxPayloadBytes
	if payloadLimit <= 0 {
		payloadLimit = 512 * 1024
	}

	direct, directMeta, ok, directErr := tryPassthrough(path, fileSize, width, height, format, opt, dimLimit, payloadLimit)
	if ok {
		return direct, directMeta, directErr
	}
	return compressWithImaging(path, opt, dimLimit, payloadLimit, meta)
}
// tryPassthrough decides whether the file at path can be sent unmodified and,
// if so, returns its raw bytes.
//
// size, w, h and format describe the file as previously probed by the caller.
// Passthrough is only used when all of the following hold:
//   - opt.SkipPreprocessBelowBytes is positive and size does not exceed it,
//   - neither w nor h exceeds maxDim,
//   - size does not exceed maxPayload,
//   - format maps to a known MIME type.
//
// The returned bool reports whether passthrough was used. When it is false
// the payload is empty and the caller is expected to take another path; a
// non-nil error is only returned when reading the file fails.
func tryPassthrough(path string, size int64, w, h int, format string, opt PreprocessOptions, maxDim int, maxPayload int64) (ImagePayload, PreprocessMeta, bool, error) {
	meta := PreprocessMeta{
		OriginalPath:   path,
		OriginalBytes:  size,
		OriginalWidth:  w,
		OriginalHeight: h,
	}

	threshold := opt.SkipPreprocessBelowBytes
	if threshold <= 0 || size > threshold || size > maxPayload {
		return ImagePayload{}, meta, false, nil
	}
	if w > maxDim || h > maxDim {
		return ImagePayload{}, meta, false, nil
	}

	// Resolve the MIME type before touching the file so an unsupported format
	// costs no I/O and cannot produce a spurious read error.
	mime := mimeFromImageFormat(format)
	if mime == "" {
		return ImagePayload{}, meta, false, nil
	}

	raw, err := os.ReadFile(path)
	if err != nil {
		return ImagePayload{}, meta, false, err
	}
	// size was measured earlier; re-check the bytes actually read so a file
	// that grew in the meantime is not sent over the limits.
	if n := int64(len(raw)); n > threshold || n > maxPayload {
		return ImagePayload{}, meta, false, nil
	}

	meta.OutputWidth = w
	meta.OutputHeight = h
	meta.OutputBytes = len(raw)
	meta.OutputMIMEType = mime
	meta.PreprocessMode = "passthrough"
	return ImagePayload{Bytes: raw, MIMEType: mime}, meta, true, nil
}
// compressWithImaging decodes the image at path, scales it down to fit inside
// a maxDim x maxDim box and re-encodes it as JPEG no larger than maxPayload
// bytes.
//
// Each attempt walks a quality ladder downwards in steps of 5. When no rung
// fits, the edge limit is reduced to 85% of its previous value and the ladder
// is retried, starting from at most quality 75. At most six attempts are made,
// and the loop stops early once neither the size nor the ladder can change.
//
// opt.JPEGQuality is the starting quality (values outside 1..100 fall back to
// 82) and is never exceeded. The ladder stops at 60, or at the configured
// quality when that is lower. The edge limit is not reduced below 256 pixels,
// or below maxDim when maxDim itself is smaller than that. maxDim is expected
// to be positive.
//
// meta is returned with the original and output dimensions filled in and, on
// success, the JPEG quality, output size, MIME type and mode.
//
// An error is returned when the image cannot be opened or encoded, or when no
// attempt produces a payload within maxPayload.
func compressWithImaging(path string, opt PreprocessOptions, maxDim int, maxPayload int64, meta PreprocessMeta) (ImagePayload, PreprocessMeta, error) {
	const (
		defaultQuality = 82
		retryQuality   = 75
		qualityFloor   = 60
		qualityStep    = 5
		minDimension   = 256
		shrinkFactor   = 0.85
		maxAttempts    = 6
	)

	src, err := imaging.Open(path)
	if err != nil {
		return ImagePayload{}, meta, fmt.Errorf("open image: %w", err)
	}
	bounds := src.Bounds()
	meta.OriginalWidth = bounds.Dx()
	meta.OriginalHeight = bounds.Dy()

	quality := opt.JPEGQuality
	if quality <= 0 || quality > 100 {
		quality = defaultQuality
	}
	// Honor a configured quality below the usual floor instead of skipping
	// the ladder entirely.
	minQuality := qualityFloor
	if quality < minQuality {
		minQuality = quality
	}

	// The shrink floor must never exceed the configured maximum dimension.
	dimFloor := minDimension
	if maxDim < dimFloor {
		dimFloor = maxDim
	}

	dst := imaging.Fit(src, maxDim, maxDim, imaging.Lanczos)

	// Fit only scales down, so an image already inside the box keeps its
	// size. Shrink from the real long edge; starting from maxDim would
	// re-encode the same pixels until the limit dropped below the image.
	fitted := dst.Bounds()
	dim := fitted.Dx()
	if fitted.Dy() > dim {
		dim = fitted.Dy()
	}

	ladderStart := quality
	for attempt := 0; attempt < maxAttempts; attempt++ {
		if attempt > 0 {
			next := int(float64(dim) * shrinkFactor)
			if next < dimFloor {
				next = dimFloor
			}
			retryStart := ladderStart
			if retryStart > retryQuality {
				retryStart = retryQuality
			}
			if next >= dim && retryStart == ladderStart {
				// Same size and same ladder as the previous attempt:
				// encoding again would give identical results.
				break
			}
			ladderStart = retryStart
			if next < dim {
				dim = next
				dst = imaging.Fit(src, dim, dim, imaging.Lanczos)
			}
		}

		out := dst.Bounds()
		meta.OutputWidth = out.Dx()
		meta.OutputHeight = out.Dy()

		for q := ladderStart; q >= minQuality; q -= qualityStep {
			var buf bytes.Buffer
			if err := imaging.Encode(&buf, dst, imaging.JPEG, imaging.JPEGQuality(q)); err != nil {
				return ImagePayload{}, meta, fmt.Errorf("encode jpeg: %w", err)
			}
			if int64(buf.Len()) <= maxPayload {
				meta.JPEGQuality = q
				meta.OutputBytes = buf.Len()
				meta.OutputMIMEType = "image/jpeg"
				meta.PreprocessMode = "jpeg"
				return ImagePayload{Bytes: buf.Bytes(), MIMEType: "image/jpeg"}, meta, nil
			}
		}
	}
	return ImagePayload{}, meta, fmt.Errorf("could not compress image under max_payload_bytes %d", maxPayload)
}
// imageDimensions opens the file at path and reads its image configuration
// with image.DecodeConfig, returning the width and height in pixels together
// with the format name reported by the decoder.
//
// An error from opening the file is returned as is; a decoding failure is
// wrapped with the prefix "decode image config".
func imageDimensions(path string) (w, h int, format string, err error) {
	file, openErr := os.Open(path)
	if openErr != nil {
		return 0, 0, "", openErr
	}
	defer file.Close()

	header, kind, decodeErr := image.DecodeConfig(file)
	if decodeErr != nil {
		return 0, 0, "", fmt.Errorf("decode image config: %w", decodeErr)
	}
	return header.Width, header.Height, kind, nil
}
// mimeFromImageFormat maps an image format name to its MIME type.
//
// The name is matched case-insensitively after trimming surrounding
// whitespace. Recognized names are jpeg (or jpg), png, gif, webp, bmp and
// tiff; any other input yields the empty string.
func mimeFromImageFormat(format string) string {
	name := strings.ToLower(strings.TrimSpace(format))
	if name == "jpg" {
		// "jpg" is an alias; the MIME subtype is always "jpeg".
		name = "jpeg"
	}
	switch name {
	case "jpeg", "png", "gif", "webp", "bmp", "tiff":
		return "image/" + name
	}
	return ""
}
// DecodeImageConfig is a test helper that confirms the file at path can be
// decoded. It returns the image configuration and format name reported by
// image.DecodeConfig, or the error from opening or decoding the file.
func DecodeImageConfig(path string) (image.Config, string, error) {
	file, err := os.Open(path)
	if err != nil {
		return image.Config{}, "", err
	}
	defer file.Close()

	cfg, format, err := image.DecodeConfig(file)
	return cfg, format, err
}