// OpenSource / ollama

// Package imageproc decodes an image and converts it into normalized,
// tiled float32 pixel data.
package imageproc

import (
	"bytes"
	"fmt"
	"image"
	"image/color"
	_ "image/jpeg"
	_ "image/png"
	"math"
	"slices"

	"golang.org/x/image/draw"
)

// GetSupportedAspectRatios lists every tile grid, as image.Point{X: tiles
// across, Y: tiles down}, whose total tile count does not exceed maxTiles.
//
// Grids are ordered by ascending X and, for equal X, by ascending Y. When
// maxTiles is less than 1 the result is an empty, non-nil slice.
func GetSupportedAspectRatios(maxTiles int) []image.Point {
	grids := make([]image.Point, 0)

	for across := 1; across <= maxTiles; across++ {
		// Largest number of rows that keeps across*down within the budget.
		maxDown := maxTiles / across
		for down := 1; down <= maxDown; down++ {
			grids = append(grids, image.Point{X: across, Y: down})
		}
	}

	return grids
}

// clip constrains a to the range [aMin, aMax].
//
// The lower bound is tested first, so if the bounds are inverted
// (aMin > aMax) a value below aMin still yields aMin and any other value
// yields aMax.
func clip(a, aMin, aMax int) int {
	switch {
	case a < aMin:
		return aMin
	case a > aMax:
		return aMax
	default:
		return a
	}
}

// getImageSizeFitToCanvas computes the size an image should be scaled to so
// that it fits canvasSize while keeping its aspect ratio.
//
// Each dimension of imageSize is first clipped to [tileSize, canvas
// dimension]. The smaller of the two resulting scale factors is then applied
// to the other dimension (rounded down), and the result never exceeds the
// clipped target in either dimension.
//
// imageSize is used as a divisor, so a zero dimension produces Inf/NaN scale
// factors; callers are expected to pass a non-empty size.
func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
	target := image.Point{
		X: clip(imageSize.X, tileSize, canvasSize.X),
		Y: clip(imageSize.Y, tileSize, canvasSize.Y),
	}

	ratioX := float64(target.X) / float64(imageSize.X)
	ratioY := float64(target.Y) / float64(imageSize.Y)

	if ratioX < ratioY {
		// Width is the tighter constraint: pin it and derive the height.
		scaledY := int(math.Floor(float64(imageSize.Y) * ratioX))
		return image.Point{X: target.X, Y: min(scaledY, target.Y)}
	}

	// Height is the tighter (or equal) constraint: pin it and derive the width.
	scaledX := int(math.Floor(float64(imageSize.X) * ratioY))
	return image.Point{X: min(scaledX, target.X), Y: target.Y}
}

// getOptimalTiledCanvas chooses the canvas size, in pixels, that an image of
// imageSize should be fitted onto, given at most maxImageTiles square tiles
// of tileSize pixels.
//
// For every supported tile arrangement the aspect-preserving fit scale is
// computed (the smaller of the width and height ratios). If any arrangement
// has a scale strictly greater than 1 the smallest such scale is selected;
// otherwise the largest remaining scale is selected. Among canvases sharing
// the selected scale, the one with the smallest area wins. The zero Point is
// returned when no canvas matches.
//
// NOTE(review): a scale of exactly 1.0 is grouped with the downscale
// candidates (the test is s > 1.0) — confirm this is intended.
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
	arrangements := GetSupportedAspectRatios(maxImageTiles)

	canvases := make([]image.Point, len(arrangements))
	fitScales := make([]float64, len(arrangements))
	for i, grid := range arrangements {
		canvas := image.Point{X: grid.X * tileSize, Y: grid.Y * tileSize}
		canvases[i] = canvas

		byWidth := float64(canvas.X) / float64(imageSize.X)
		byHeight := float64(canvas.Y) / float64(imageSize.Y)

		// The limiting (smaller) ratio is the scale at which the image fits.
		fitScales[i] = byWidth
		if byWidth > byHeight {
			fitScales[i] = byHeight
		}
	}

	var (
		smallestUpscale  float64
		largestDownscale float64
		canUpscale       bool
	)
	for _, s := range fitScales {
		if s > 1.0 {
			if canUpscale {
				smallestUpscale = math.Min(smallestUpscale, s)
			} else {
				smallestUpscale = s
				canUpscale = true
			}
			continue
		}
		largestDownscale = math.Max(largestDownscale, s)
	}

	chosenScale := largestDownscale
	if canUpscale {
		chosenScale = smallestUpscale
	}

	var best image.Point
	for i, canvas := range canvases {
		if fitScales[i] != chosenScale {
			continue
		}
		// Prefer the smallest canvas among those with the chosen scale.
		if best == (image.Point{}) || canvas.X*canvas.Y < best.X*best.Y {
			best = canvas
		}
	}
	return best
}

// splitToTiles cuts img into a grid of numTilesSize.X columns by
// numTilesSize.Y rows of equally sized tiles and returns them row by row
// (left to right, then top to bottom).
//
// Tile width and height are the image extent divided by the column and row
// counts using integer division, so any remainder pixels on the right or
// bottom edge are not covered. Tile rectangles are positioned relative to
// img.Bounds().Min, so images whose bounds do not start at (0, 0) are tiled
// correctly.
//
// An empty slice is returned when either tile count is not positive. If img
// does not provide a SubImage method it is first copied into an *image.RGBA
// so tiles can still be produced instead of panicking.
func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
	if numTilesSize.X <= 0 || numTilesSize.Y <= 0 {
		// Avoids an integer divide-by-zero below.
		return []image.Image{}
	}

	type subImager interface {
		SubImage(image.Rectangle) image.Image
	}

	b := img.Bounds()

	src, ok := img.(subImager)
	if !ok {
		// Fall back to a pixel-by-pixel copy into a type that supports SubImage.
		rgba := image.NewRGBA(b)
		for y := b.Min.Y; y < b.Max.Y; y++ {
			for x := b.Min.X; x < b.Max.X; x++ {
				rgba.Set(x, y, img.At(x, y))
			}
		}
		src = rgba
	}

	tileWidth := b.Dx() / numTilesSize.X
	tileHeight := b.Dy() / numTilesSize.Y

	tiles := make([]image.Image, 0, numTilesSize.X*numTilesSize.Y)
	for row := 0; row < numTilesSize.Y; row++ {
		for col := 0; col < numTilesSize.X; col++ {
			// Offset by b.Min: SubImage intersects with the image's own bounds.
			left := b.Min.X + col*tileWidth
			top := b.Min.Y + row*tileHeight
			tiles = append(tiles, src.SubImage(image.Rect(left, top, left+tileWidth, top+tileHeight)))
		}
	}

	return tiles
}

// compositeImage flattens img onto an opaque white background, which removes
// the effect of the alpha channel. It returns a new RGBA image with the same
// bounds as img; transparent areas of the source come out white.
func compositeImage(img image.Image) image.Image {
	bounds := img.Bounds()
	flattened := image.NewRGBA(bounds)

	// Prefill with white, then blend the source on top of it.
	background := image.NewUniform(color.RGBA{R: 255, G: 255, B: 255, A: 255})
	draw.Draw(flattened, bounds, background, image.Point{}, draw.Src)
	draw.Draw(flattened, bounds, img, bounds.Min, draw.Over)

	return flattened
}

// ResizeImage scales img to fit the tiled canvas chosen for its size and
// returns the scaled image together with the tile arrangement of that canvas
// (X tiles across, Y tiles down).
//
// When format is "png" the image is first composited onto a white background.
// Tiles are treated as square: only outputSize.Y is used as the tile edge
// length, and it must be non-zero because it is used as a divisor.
// maxImageTiles bounds the number of tiles the canvas may contain.
//
// The returned image always has its origin at (0, 0), regardless of the
// bounds of img.
func ResizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
	if format == "png" {
		img = compositeImage(img)
	}

	b := img.Bounds()
	// Use the extent rather than b.Max: an image's bounds are not required to
	// start at (0, 0), in which case b.Max overstates its size.
	imageSize := b.Size()
	tileSize := outputSize.Y

	canvasSize := getOptimalTiledCanvas(imageSize, maxImageTiles, tileSize)
	aspectRatio := image.Point{X: canvasSize.X / tileSize, Y: canvasSize.Y / tileSize}
	newSize := getImageSizeFitToCanvas(imageSize, canvasSize, tileSize)

	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))

	// scaling choices:
	//   NearestNeighbor	fast, blocky output
	//   ApproxBiLinear	fast, medium quality
	//   BiLinear		slow, high quality
	//   CatmullRom		very slow, very high quality
	draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)

	return dst, aspectRatio
}

// PadImage places img in the top-left corner of a new RGBA canvas measuring
// outputSize.X*aspectRatio.X by outputSize.Y*aspectRatio.Y pixels and returns
// the canvas. Pixels not covered by img keep the zero value of a fresh RGBA
// image, and any part of img that extends past the canvas is clipped.
//
// The image is read starting at img.Bounds().Min, so images whose bounds do
// not start at (0, 0) are positioned the same way as origin-based ones.
func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
	paddedSize := image.Point{
		X: outputSize.X * aspectRatio.X,
		Y: outputSize.Y * aspectRatio.Y,
	}

	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))

	src := img.Bounds()
	// Destination rectangle is anchored at the canvas origin; the source point
	// is the image's own top-left corner.
	draw.Draw(dst, image.Rect(0, 0, src.Dx(), src.Dy()), img, src.Min, draw.Over)

	return dst
}

// PackImages splits img into tiles according to aspectRatio and flattens the
// tiles into a single slice of normalized channel values.
//
// Tiles are emitted in the order splitToTiles returns them. Each tile
// contributes three consecutive planes — all red values, then all green, then
// all blue — with pixels inside a plane ordered row by row. A channel value is
// taken from the high 8 bits of the 16-bit value reported by Color.RGBA,
// scaled to [0, 1] by dividing by 255, and then normalized as
// (value - mean[c]) / std[c]. The alpha channel is ignored.
//
// The result is nil when there are no pixels to pack.
func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
	subImages := splitToTiles(img, aspectRatio)

	var pixelVals []float32

	for _, tile := range subImages {
		bounds := tile.Bounds()
		if bounds.Empty() {
			continue
		}

		// Reserve the three planes for this tile up front and fill them in
		// place, instead of growing per-channel slices pixel by pixel.
		planeSize := bounds.Dx() * bounds.Dy()
		base := len(pixelVals)
		pixelVals = append(pixelVals, make([]float32, 3*planeSize)...)
		planes := pixelVals[base:]

		i := 0
		for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
			for x := bounds.Min.X; x < bounds.Max.X; x++ {
				r, g, b, _ := tile.At(x, y).RGBA()
				channels := [3]uint32{r, g, b}
				for c, v := range channels {
					scaled := float32(v>>8) / 255.0
					planes[c*planeSize+i] = (scaled - mean[c]) / std[c]
				}
				i++
			}
		}
	}

	return pixelVals
}

// Preprocess decodes imageData (JPEG and PNG decoders are registered by this
// file) and turns it into packed pixel data using fixed settings: 560x560
// tiles, at most 4 tiles, and the normalization constants below.
//
// It returns the packed float32 data and the 1-based position of the chosen
// tile arrangement within GetSupportedAspectRatios(4); the position is 0 if
// the arrangement is not in that list. An error is returned only when the
// image cannot be decoded.
func Preprocess(imageData []byte) ([]float32, int, error) {
	// todo: need guard in here for bad image data

	// mllama values
	const maxTiles = 4
	tileSize := image.Point{X: 560, Y: 560}

	// clip values
	var (
		mean = [3]float32{0.48145466, 0.4578275, 0.40821073}
		std  = [3]float32{0.26862954, 0.26130258, 0.27577711}
	)

	decoded, format, err := image.Decode(bytes.NewReader(imageData))
	if err != nil {
		return nil, 0, fmt.Errorf("failed to decode image: %w", err)
	}

	resized, aspectRatio := ResizeImage(decoded, format, tileSize, maxTiles)
	padded := PadImage(resized, tileSize, aspectRatio)
	data := PackImages(padded, aspectRatio, mean, std)

	// slices.Index yields -1 when absent, so the 1-based index becomes 0.
	aspectRatioIndex := 1 + slices.Index(GetSupportedAspectRatios(maxTiles), aspectRatio)

	return data, aspectRatioIndex, nil
}