123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596 |
- package mistral3
- import (
- "fmt"
- "image"
- _ "image/jpeg"
- _ "image/png"
- "io"
- "math"
- "github.com/ollama/ollama/ml"
- "github.com/ollama/ollama/model/imageproc"
- )
// getNumImageTokens reports how many patches of patchSize are needed to cover
// an image of imageSize along each axis. The count is rounded up, so a
// partially covered patch still counts as a whole one.
//
// Both components of patchSize must be non-zero; a zero component triggers an
// integer divide-by-zero panic.
func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
	// (n-1)/p + 1 is integer ceiling division for n >= 1.
	cols := (imageSize.X-1)/patchSize.X + 1
	rows := (imageSize.Y-1)/patchSize.Y + 1
	return image.Point{X: cols, Y: rows}
}
- func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
- b := img.Bounds()
- le := float64(longestEdge)
- ratio := math.Max(float64(b.Max.Y)/le, float64(b.Max.X)/le)
- newSize := img.Bounds().Max
- if ratio > 1.0 {
- newSize = image.Point{
- int(math.Floor(float64(b.Max.X) / ratio)),
- int(math.Floor(float64(b.Max.Y) / ratio)),
- }
- }
- tokens := getNumImageTokens(newSize, patchSize)
- return image.Point{
- tokens.X * patchSize.X,
- tokens.Y * patchSize.Y,
- }
- }
- func resizeImage(img image.Image, format string, longestEdge int, patchSize image.Point) image.Image {
- if format == "png" {
- img = imageproc.Composite(img)
- }
- newSize := getResizeOutputImageSize(img, longestEdge, patchSize)
- // todo should be ResizeBicubic, but it doesn't exist
- return imageproc.Resize(img, newSize, imageproc.ResizeBilinear)
- }
- func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
- img, format, err := image.Decode(imageData)
- if err != nil {
- return nil, nil, fmt.Errorf("failed to decode image: %w", err)
- }
- longestEdge := 1024
- patchSize := image.Point{16, 16}
- img = resizeImage(img, format, longestEdge, patchSize)
- data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
- opts := map[string]any{}
- return data, opts, nil
- }
// ImageProcessor holds the vision preprocessing parameters that
// newImageProcessor reads from the model configuration.
type ImageProcessor struct {
	// imageSize and patchSize are filled from "vision.image_size" and
	// "vision.patch_size". ProcessImage uses patchSize for both axes of the
	// patch grid.
	imageSize, patchSize int

	// numChannels is filled from "vision.num_channels".
	numChannels int

	// longestEdge is filled from "vision.longest_edge" and is the limit
	// ProcessImage passes to getResizeOutputImageSize.
	longestEdge int
}
- func newImageProcessor(c ml.Config) ImageProcessor {
- return ImageProcessor{
- imageSize: int(c.Uint("vision.image_size", 1540)),
- patchSize: int(c.Uint("vision.patch_size", 14)),
- numChannels: int(c.Uint("vision.num_channels", 3)),
- longestEdge: int(c.Uint("vision.longest_edge", 1024)),
- }
- }
- func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
- outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
- newImage := imageproc.Composite(img)
- newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
- data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
- return data, nil
- }
|