|
@@ -1,19 +1,20 @@
|
|
-package imageproc
|
|
|
|
|
|
+package mllama
|
|
|
|
|
|
import (
|
|
import (
|
|
- "bytes"
|
|
|
|
"fmt"
|
|
"fmt"
|
|
"image"
|
|
"image"
|
|
- "image/color"
|
|
|
|
_ "image/jpeg"
|
|
_ "image/jpeg"
|
|
_ "image/png"
|
|
_ "image/png"
|
|
|
|
+ "io"
|
|
"math"
|
|
"math"
|
|
"slices"
|
|
"slices"
|
|
|
|
|
|
"golang.org/x/image/draw"
|
|
"golang.org/x/image/draw"
|
|
|
|
+
|
|
|
|
+ "github.com/ollama/ollama/model/imageproc"
|
|
)
|
|
)
|
|
|
|
|
|
-func GetSupportedAspectRatios(maxTiles int) []image.Point {
|
|
|
|
|
|
+func getSupportedAspectRatios(maxTiles int) []image.Point {
|
|
ratios := []image.Point{}
|
|
ratios := []image.Point{}
|
|
|
|
|
|
for w := range maxTiles {
|
|
for w := range maxTiles {
|
|
@@ -37,28 +38,8 @@ func clip(a, a_min, a_max int) int {
|
|
return a
|
|
return a
|
|
}
|
|
}
|
|
|
|
|
|
-func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
|
|
|
|
- targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
|
|
|
|
- targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
|
|
|
|
-
|
|
|
|
- scaleWidth := float64(targetWidth) / float64(imageSize.X)
|
|
|
|
- scaleHeight := float64(targetHeight) / float64(imageSize.Y)
|
|
|
|
-
|
|
|
|
- var w, h int
|
|
|
|
-
|
|
|
|
- if scaleWidth < scaleHeight {
|
|
|
|
- w = targetWidth
|
|
|
|
- h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
|
|
|
|
- } else {
|
|
|
|
- w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
|
|
|
|
- h = targetHeight
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- return image.Point{w, h}
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
|
|
func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
|
|
- possibleTileArrangements := GetSupportedAspectRatios(maxImageTiles)
|
|
|
|
|
|
+ possibleTileArrangements := getSupportedAspectRatios(maxImageTiles)
|
|
possibleCanvasSizes := []image.Point{}
|
|
possibleCanvasSizes := []image.Point{}
|
|
for _, pta := range possibleTileArrangements {
|
|
for _, pta := range possibleTileArrangements {
|
|
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
|
|
possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
|
|
@@ -113,41 +94,29 @@ func getOptimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) i
|
|
return selectedCanvas
|
|
return selectedCanvas
|
|
}
|
|
}
|
|
|
|
|
|
-func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
|
|
|
|
- b := img.Bounds()
|
|
|
|
- width := b.Max.X - b.Min.X
|
|
|
|
- height := b.Max.Y - b.Min.Y
|
|
|
|
- tileHeight := height / numTilesSize.Y
|
|
|
|
- tileWidth := width / numTilesSize.X
|
|
|
|
-
|
|
|
|
- images := []image.Image{}
|
|
|
|
-
|
|
|
|
- for h := range numTilesSize.Y {
|
|
|
|
- for w := range numTilesSize.X {
|
|
|
|
- rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
|
|
|
|
- images = append(images, img.(interface {
|
|
|
|
- SubImage(image.Rectangle) image.Image
|
|
|
|
- }).SubImage(rect))
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
+func getImageSizeFitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
|
|
|
|
+ targetWidth := clip(imageSize.X, tileSize, canvasSize.X)
|
|
|
|
+ targetHeight := clip(imageSize.Y, tileSize, canvasSize.Y)
|
|
|
|
|
|
- return images
|
|
|
|
-}
|
|
|
|
|
|
+ scaleWidth := float64(targetWidth) / float64(imageSize.X)
|
|
|
|
+ scaleHeight := float64(targetHeight) / float64(imageSize.Y)
|
|
|
|
|
|
-// remove the "alpha" channel by drawing over a prefilled image
|
|
|
|
-func compositeImage(img image.Image) image.Image {
|
|
|
|
- dst := image.NewRGBA(img.Bounds())
|
|
|
|
|
|
+ var w, h int
|
|
|
|
|
|
- white := color.RGBA{255, 255, 255, 255}
|
|
|
|
- draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
|
|
|
|
- draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
|
|
|
|
|
|
+ if scaleWidth < scaleHeight {
|
|
|
|
+ w = targetWidth
|
|
|
|
+ h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
|
|
|
|
+ } else {
|
|
|
|
+ w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
|
|
|
|
+ h = targetHeight
|
|
|
|
+ }
|
|
|
|
|
|
- return dst
|
|
|
|
|
|
+ return image.Point{w, h}
|
|
}
|
|
}
|
|
|
|
|
|
-func ResizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
|
|
|
|
|
|
+func resizeImage(img image.Image, format string, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
|
|
if format == "png" {
|
|
if format == "png" {
|
|
- img = compositeImage(img)
|
|
|
|
|
|
+ img = imageproc.Composite(img)
|
|
}
|
|
}
|
|
|
|
|
|
b := img.Bounds()
|
|
b := img.Bounds()
|
|
@@ -157,19 +126,10 @@ func ResizeImage(img image.Image, format string, outputSize image.Point, maxImag
|
|
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
|
|
aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
|
|
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
|
|
newSize := getImageSizeFitToCanvas(b.Max, canvasSize, tileSize)
|
|
|
|
|
|
- dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
|
|
|
|
-
|
|
|
|
- // scaling choices:
|
|
|
|
- // NearestNeighbor fast, blocky output
|
|
|
|
- // ApproxBiLinear fast, medium quality
|
|
|
|
- // BiLinear slow, high quality
|
|
|
|
- // CatmullRom very slow, very high quality
|
|
|
|
- draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
|
|
|
|
-
|
|
|
|
- return dst, aspectRatio
|
|
|
|
|
|
+ return imageproc.Resize(img, newSize, imageproc.ResizeBilinear), aspectRatio
|
|
}
|
|
}
|
|
|
|
|
|
-func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
|
|
|
|
|
|
+func padImage(img image.Image, outputSize, aspectRatio image.Point) image.Image {
|
|
paddedSize := image.Point{
|
|
paddedSize := image.Point{
|
|
X: outputSize.X * aspectRatio.X,
|
|
X: outputSize.X * aspectRatio.X,
|
|
Y: outputSize.Y * aspectRatio.Y,
|
|
Y: outputSize.Y * aspectRatio.Y,
|
|
@@ -181,60 +141,61 @@ func PadImage(img image.Image, outputSize, aspectRatio image.Point) image.Image
|
|
return dst
|
|
return dst
|
|
}
|
|
}
|
|
|
|
|
|
-func PackImages(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
|
|
|
|
|
|
+func splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
|
|
|
|
+ b := img.Bounds()
|
|
|
|
+ width := b.Max.X - b.Min.X
|
|
|
|
+ height := b.Max.Y - b.Min.Y
|
|
|
|
+ tileHeight := height / numTilesSize.Y
|
|
|
|
+ tileWidth := width / numTilesSize.X
|
|
|
|
+
|
|
|
|
+ images := []image.Image{}
|
|
|
|
+
|
|
|
|
+ for h := range numTilesSize.Y {
|
|
|
|
+ for w := range numTilesSize.X {
|
|
|
|
+ rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
|
|
|
|
+ images = append(images, img.(interface {
|
|
|
|
+ SubImage(image.Rectangle) image.Image
|
|
|
|
+ }).SubImage(rect))
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ return images
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+func packImages(img image.Image, aspectRatio image.Point) []float32 {
|
|
subImages := splitToTiles(img, aspectRatio)
|
|
subImages := splitToTiles(img, aspectRatio)
|
|
|
|
|
|
var pixelVals []float32
|
|
var pixelVals []float32
|
|
|
|
|
|
|
|
+ rescale := true
|
|
|
|
+ channelFirst := true
|
|
|
|
+
|
|
for _, subImg := range subImages {
|
|
for _, subImg := range subImages {
|
|
- bounds := subImg.Bounds()
|
|
|
|
- var rVals, gVals, bVals []float32
|
|
|
|
- for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
|
|
|
|
- for x := bounds.Min.X; x < bounds.Max.X; x++ {
|
|
|
|
- c := subImg.At(x, y)
|
|
|
|
- r, g, b, _ := c.RGBA()
|
|
|
|
- rVal := float32(r>>8) / 255.0
|
|
|
|
- gVal := float32(g>>8) / 255.0
|
|
|
|
- bVal := float32(b>>8) / 255.0
|
|
|
|
-
|
|
|
|
- rVal = (rVal - mean[0]) / std[0]
|
|
|
|
- gVal = (gVal - mean[1]) / std[1]
|
|
|
|
- bVal = (bVal - mean[2]) / std[2]
|
|
|
|
-
|
|
|
|
- rVals = append(rVals, rVal)
|
|
|
|
- gVals = append(gVals, gVal)
|
|
|
|
- bVals = append(bVals, bVal)
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
- pixelVals = append(pixelVals, rVals...)
|
|
|
|
- pixelVals = append(pixelVals, gVals...)
|
|
|
|
- pixelVals = append(pixelVals, bVals...)
|
|
|
|
|
|
+ vals := imageproc.Normalize(subImg, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, rescale, channelFirst)
|
|
|
|
+ pixelVals = append(pixelVals, vals...)
|
|
}
|
|
}
|
|
|
|
|
|
return pixelVals
|
|
return pixelVals
|
|
}
|
|
}
|
|
|
|
|
|
-func Preprocess(imageData []byte) ([]float32, int, error) {
|
|
|
|
- // todo: need guard in here for bad image data
|
|
|
|
-
|
|
|
|
- // mllama values
|
|
|
|
|
|
+func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
|
|
outputSize := image.Point{560, 560}
|
|
outputSize := image.Point{560, 560}
|
|
maxTiles := 4
|
|
maxTiles := 4
|
|
|
|
|
|
- // clip values
|
|
|
|
- mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
|
|
|
|
- std := [3]float32{0.26862954, 0.26130258, 0.27577711}
|
|
|
|
-
|
|
|
|
- img, format, err := image.Decode(bytes.NewReader(imageData))
|
|
|
|
|
|
+ img, format, err := image.Decode(imageData)
|
|
if err != nil {
|
|
if err != nil {
|
|
- return nil, 0, fmt.Errorf("failed to decode image: %w", err)
|
|
|
|
|
|
+ return nil, nil, fmt.Errorf("failed to decode image: %w", err)
|
|
}
|
|
}
|
|
|
|
|
|
- newImage, aspectRatio := ResizeImage(img, format, outputSize, maxTiles)
|
|
|
|
- newImage = PadImage(newImage, outputSize, aspectRatio)
|
|
|
|
|
|
+ newImage, aspectRatio := resizeImage(img, format, outputSize, maxTiles)
|
|
|
|
+ newImage = padImage(newImage, outputSize, aspectRatio)
|
|
|
|
|
|
- data := PackImages(newImage, aspectRatio, mean, std)
|
|
|
|
- aspectRatioIndex := slices.Index(GetSupportedAspectRatios(maxTiles), aspectRatio) + 1
|
|
|
|
|
|
+ data := packImages(newImage, aspectRatio)
|
|
|
|
+ aspectRatioIndex := slices.Index(getSupportedAspectRatios(maxTiles), aspectRatio) + 1
|
|
|
|
+
|
|
|
|
+ opts := map[string]any{
|
|
|
|
+ "aspectRatioIndex": aspectRatioIndex,
|
|
|
|
+ }
|
|
|
|
|
|
- return data, aspectRatioIndex, nil
|
|
|
|
|
|
+ return data, opts, nil
|
|
}
|
|
}
|