1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- package mistral3
- import (
- "fmt"
- "image"
- _ "image/jpeg"
- _ "image/png"
- "io"
- "math"
- "github.com/ollama/ollama/ml"
- "github.com/ollama/ollama/model/imageproc"
- )
// getNumImageTokens reports how many patches of patchSize are needed to cover
// an image of imageSize along each axis. Each axis is rounded up, so a
// partially filled trailing patch still counts as a whole one.
func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
	// ceilDiv is integer ceiling division written as (n-1)/d + 1. Because Go
	// integer division truncates toward zero, a zero extent yields 1, not 0.
	ceilDiv := func(extent, patch int) int {
		return (extent-1)/patch + 1
	}

	var tokens image.Point
	tokens.X = ceilDiv(imageSize.X, patchSize.X)
	tokens.Y = ceilDiv(imageSize.Y, patchSize.Y)
	return tokens
}
- func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
- b := img.Bounds()
- ratio := math.Max(float64(b.Max.Y)/float64(longestEdge), float64(b.Max.X)/float64(longestEdge))
- newSize := img.Bounds().Max
- if ratio > 1.0 {
- newSize = image.Point{
- int(math.Floor(float64(b.Max.X) / ratio)),
- int(math.Floor(float64(b.Max.Y) / ratio)),
- }
- }
- tokens := getNumImageTokens(newSize, patchSize)
- return image.Point{
- tokens.X * patchSize.X,
- tokens.Y * patchSize.Y,
- }
- }
- func resizeImage(img image.Image, format string, longestEdge int, patchSize image.Point) image.Image {
- if format == "png" {
- img = imageproc.Composite(img)
- }
- newSize := getResizeOutputImageSize(img, longestEdge, patchSize)
- // todo should be ResizeBicubic, but it doesn't exist
- return imageproc.Resize(img, newSize, imageproc.ResizeBilinear)
- }
- func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
- img, format, err := image.Decode(imageData)
- if err != nil {
- return nil, nil, fmt.Errorf("failed to decode image: %w", err)
- }
- longestEdge := 1024
- patchSize := image.Point{16, 16}
- img = resizeImage(img, format, longestEdge, patchSize)
- data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
- opts := map[string]any{}
- return data, opts, nil
- }
// ImageProcessor holds the vision preprocessing parameters that
// newImageProcessor reads from the model configuration.
type ImageProcessor struct {
	// imageSize comes from the "vision.image_size" config key.
	imageSize int
	// patchSize is the side length of a square patch; output dimensions are
	// rounded up to a multiple of it.
	patchSize int
	// numChannels comes from the "vision.num_channels" config key.
	numChannels int
	// longestEdge is the limit on an image's longer side before rounding up
	// to whole patches.
	longestEdge int
}
- func newImageProcessor(c ml.Config) ImageProcessor {
- return ImageProcessor{
- imageSize: int(c.Uint("vision.image_size", 1540)),
- patchSize: int(c.Uint("vision.patch_size", 14)),
- numChannels: int(c.Uint("vision.num_channels", 3)),
- longestEdge: int(c.Uint("vision.longest_edge", 1540)),
- }
- }
- func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
- outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
- newImage := imageproc.Composite(img)
- newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
- data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
- return data, nil
- }
|