process_image.go 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. package mllama
  2. import (
  3. "image"
  4. "image/color"
  5. "math"
  6. "slices"
  7. "golang.org/x/image/draw"
  8. "github.com/ollama/ollama/ml"
  9. )
  // ImageProcessor carries the vision settings used when converting an image
  // into tiled, normalized pixel data.
  type ImageProcessor struct {
  	// imageSize is the edge length, in pixels, of one square tile.
  	imageSize int
  	// numChannels is populated from the "vision.num_channels" config entry.
  	numChannels int
  	// maxNumTiles is the largest number of tiles a single image may occupy.
  	maxNumTiles int
  }
  13. func newImageProcessor(c ml.Config) ImageProcessor {
  14. return ImageProcessor{
  15. imageSize: int(c.Uint("vision.image_size")),
  16. numChannels: int(c.Uint("vision.num_channels")),
  17. maxNumTiles: int(c.Uint("vision.max_num_tiles")),
  18. }
  19. }
  20. func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
  21. ratios := []image.Point{}
  22. for w := range maxTiles {
  23. for h := range maxTiles {
  24. if (w+1)*(h+1) <= maxTiles {
  25. ratios = append(ratios, image.Point{w + 1, h + 1})
  26. }
  27. }
  28. }
  29. return ratios
  30. }
  31. func (p *ImageProcessor) clip(a, a_min, a_max int) int {
  32. if a < a_min {
  33. return a_min
  34. } else if a > a_max {
  35. return a_max
  36. }
  37. return a
  38. }
  39. func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
  40. targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
  41. targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
  42. scaleWidth := float64(targetWidth) / float64(imageSize.X)
  43. scaleHeight := float64(targetHeight) / float64(imageSize.Y)
  44. var w, h int
  45. if scaleWidth < scaleHeight {
  46. w = targetWidth
  47. h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
  48. } else {
  49. w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
  50. h = targetHeight
  51. }
  52. return image.Point{w, h}
  53. }
  54. func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
  55. possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
  56. possibleCanvasSizes := []image.Point{}
  57. for _, pta := range possibleTileArrangements {
  58. possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
  59. }
  60. scales := []float64{}
  61. for _, pcs := range possibleCanvasSizes {
  62. scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
  63. scaleWidth := float64(pcs.X) / float64(imageSize.X)
  64. if scaleWidth > scaleHeight {
  65. scales = append(scales, scaleHeight)
  66. } else {
  67. scales = append(scales, scaleWidth)
  68. }
  69. }
  70. var minUpscale float64
  71. var maxDownscale float64
  72. var upscale bool
  73. for _, s := range scales {
  74. if s > 1.0 {
  75. upscale = true
  76. if minUpscale == 0 {
  77. minUpscale = s
  78. } else {
  79. minUpscale = math.Min(minUpscale, s)
  80. }
  81. } else {
  82. maxDownscale = math.Max(maxDownscale, s)
  83. }
  84. }
  85. selectedScale := maxDownscale
  86. if upscale {
  87. selectedScale = minUpscale
  88. }
  89. var selectedCanvas image.Point
  90. for n, pcs := range possibleCanvasSizes {
  91. if scales[n] == selectedScale {
  92. // choose the smallest possible canvas
  93. if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
  94. selectedCanvas = pcs
  95. } else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
  96. selectedCanvas = pcs
  97. }
  98. }
  99. }
  100. return selectedCanvas
  101. }
  102. func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
  103. b := img.Bounds()
  104. width := b.Max.X - b.Min.X
  105. height := b.Max.Y - b.Min.Y
  106. tileHeight := height / numTilesSize.Y
  107. tileWidth := width / numTilesSize.X
  108. images := []image.Image{}
  109. for h := range numTilesSize.Y {
  110. for w := range numTilesSize.X {
  111. rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
  112. images = append(images, img.(interface {
  113. SubImage(image.Rectangle) image.Image
  114. }).SubImage(rect))
  115. }
  116. }
  117. return images
  118. }
  119. // remove the "alpha" channel by drawing over a prefilled image
  120. //
  121. //nolint:unused
  122. func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
  123. dst := image.NewRGBA(img.Bounds())
  124. white := color.RGBA{255, 255, 255, 255}
  125. draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
  126. draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
  127. return dst
  128. }
  129. func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
  130. b := img.Bounds()
  131. tileSize := outputSize.Y
  132. canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
  133. aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
  134. newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)
  135. dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
  136. // scaling choices:
  137. // NearestNeighbor fast, blocky output
  138. // ApproxBiLinear fast, medium quality
  139. // BiLinear slow, high quality
  140. // CatmullRom very slow, very high quality
  141. draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
  142. return dst, aspectRatio
  143. }
  144. func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
  145. paddedSize := image.Point{
  146. X: outputSize.X * aspectRatio.X,
  147. Y: outputSize.Y * aspectRatio.Y,
  148. }
  149. dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
  150. draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
  151. return dst
  152. }
  153. func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
  154. subImages := p.splitToTiles(img, aspectRatio)
  155. var pixelVals []float32
  156. for _, subImg := range subImages {
  157. bounds := subImg.Bounds()
  158. var rVals, gVals, bVals []float32
  159. for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
  160. for x := bounds.Min.X; x < bounds.Max.X; x++ {
  161. c := subImg.At(x, y)
  162. r, g, b, _ := c.RGBA()
  163. rVal := float32(r>>8) / 255.0
  164. gVal := float32(g>>8) / 255.0
  165. bVal := float32(b>>8) / 255.0
  166. rVal = (rVal - mean[0]) / std[0]
  167. gVal = (gVal - mean[1]) / std[1]
  168. bVal = (bVal - mean[2]) / std[2]
  169. rVals = append(rVals, rVal)
  170. gVals = append(gVals, gVal)
  171. bVals = append(bVals, bVal)
  172. }
  173. }
  174. pixelVals = append(pixelVals, rVals...)
  175. pixelVals = append(pixelVals, gVals...)
  176. pixelVals = append(pixelVals, bVals...)
  177. }
  178. return pixelVals
  179. }
  180. func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
  181. outputSize := image.Point{p.imageSize, p.imageSize}
  182. // clip values
  183. mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
  184. std := [3]float32{0.26862954, 0.26130258, 0.27577711}
  185. newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
  186. newImage = p.pad(newImage, outputSize, aspectRatio)
  187. data := p.pack(newImage, aspectRatio, mean, std)
  188. aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
  189. return data, aspectRatioIndex, nil
  190. }