process_image.go 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. package mllama
  import (
  	"fmt"
  	"image"
  	"image/color"
  	"math"
  	"slices"

  	"golang.org/x/image/draw"

  	"github.com/ollama/ollama/ml"
  )
  // ImageProcessor carries the vision preprocessing parameters that
  // newImageProcessor reads from the model configuration.
  type ImageProcessor struct {
  	// imageSize is used by ProcessImage as both the width and the height of a
  	// single output tile.
  	imageSize int
  	// numChannels is loaded from the "vision.num_channels" config key.
  	numChannels int
  	// maxNumTiles is the upper bound on how many tiles an image may be split
  	// into.
  	maxNumTiles int
  }
  13. func newImageProcessor(c ml.Config) ImageProcessor {
  14. return ImageProcessor{
  15. imageSize: int(c.Uint("vision.image_size")),
  16. numChannels: int(c.Uint("vision.num_channels")),
  17. maxNumTiles: int(c.Uint("vision.max_num_tiles")),
  18. }
  19. }
  20. func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
  21. ratios := []image.Point{}
  22. for w := range maxTiles {
  23. for h := range maxTiles {
  24. if (w+1)*(h+1) <= maxTiles {
  25. ratios = append(ratios, image.Point{w + 1, h + 1})
  26. }
  27. }
  28. }
  29. return ratios
  30. }
  31. func (p *ImageProcessor) clip(a, a_min, a_max int) int {
  32. if a < a_min {
  33. return a_min
  34. } else if a > a_max {
  35. return a_max
  36. }
  37. return a
  38. }
  39. func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
  40. targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
  41. targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
  42. scaleWidth := float64(targetWidth) / float64(imageSize.X)
  43. scaleHeight := float64(targetHeight) / float64(imageSize.Y)
  44. var w, h int
  45. if scaleWidth < scaleHeight {
  46. w = targetWidth
  47. h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
  48. } else {
  49. w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
  50. h = targetHeight
  51. }
  52. return image.Point{w, h}
  53. }
  54. func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
  55. possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
  56. possibleCanvasSizes := []image.Point{}
  57. for _, pta := range possibleTileArrangements {
  58. possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
  59. }
  60. scales := []float64{}
  61. for _, pcs := range possibleCanvasSizes {
  62. scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
  63. scaleWidth := float64(pcs.X) / float64(imageSize.X)
  64. if scaleWidth > scaleHeight {
  65. scales = append(scales, scaleHeight)
  66. } else {
  67. scales = append(scales, scaleWidth)
  68. }
  69. }
  70. var minUpscale float64
  71. var maxDownscale float64
  72. var upscale bool
  73. for _, s := range scales {
  74. if s > 1.0 {
  75. upscale = true
  76. if minUpscale == 0 {
  77. minUpscale = s
  78. } else {
  79. minUpscale = math.Min(minUpscale, s)
  80. }
  81. } else {
  82. maxDownscale = math.Max(maxDownscale, s)
  83. }
  84. }
  85. selectedScale := maxDownscale
  86. if upscale {
  87. selectedScale = minUpscale
  88. }
  89. var selectedCanvas image.Point
  90. for n, pcs := range possibleCanvasSizes {
  91. if scales[n] == selectedScale {
  92. // choose the smallest possible canvas
  93. if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
  94. selectedCanvas = pcs
  95. } else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
  96. selectedCanvas = pcs
  97. }
  98. }
  99. }
  100. return selectedCanvas
  101. }
  102. func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
  103. b := img.Bounds()
  104. width := b.Max.X - b.Min.X
  105. height := b.Max.Y - b.Min.Y
  106. tileHeight := height / numTilesSize.Y
  107. tileWidth := width / numTilesSize.X
  108. images := []image.Image{}
  109. for h := range numTilesSize.Y {
  110. for w := range numTilesSize.X {
  111. rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
  112. images = append(images, img.(interface {
  113. SubImage(image.Rectangle) image.Image
  114. }).SubImage(rect))
  115. }
  116. }
  117. return images
  118. }
  119. // remove the "alpha" channel by drawing over a prefilled image
  120. //
  121. // remove the "alpha" channel by drawing over a prefilled image
  122. //
  123. //nolint:unused
  124. func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
  125. dst := image.NewRGBA(img.Bounds())
  126. white := color.RGBA{255, 255, 255, 255}
  127. draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
  128. draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
  129. return dst
  130. }
  131. func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
  132. b := img.Bounds()
  133. tileSize := outputSize.Y
  134. canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
  135. aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
  136. newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)
  137. dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
  138. // scaling choices:
  139. // NearestNeighbor fast, blocky output
  140. // ApproxBiLinear fast, medium quality
  141. // BiLinear slow, high quality
  142. // CatmullRom very slow, very high quality
  143. draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
  144. return dst, aspectRatio
  145. }
  146. func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
  147. paddedSize := image.Point{
  148. X: outputSize.X * aspectRatio.X,
  149. Y: outputSize.Y * aspectRatio.Y,
  150. }
  151. dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
  152. draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
  153. return dst
  154. }
  155. func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
  156. subImages := p.splitToTiles(img, aspectRatio)
  157. var pixelVals []float32
  158. for _, subImg := range subImages {
  159. bounds := subImg.Bounds()
  160. var rVals, gVals, bVals []float32
  161. for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
  162. for x := bounds.Min.X; x < bounds.Max.X; x++ {
  163. c := subImg.At(x, y)
  164. r, g, b, _ := c.RGBA()
  165. rVal := float32(r>>8) / 255.0
  166. gVal := float32(g>>8) / 255.0
  167. bVal := float32(b>>8) / 255.0
  168. rVal = (rVal - mean[0]) / std[0]
  169. gVal = (gVal - mean[1]) / std[1]
  170. bVal = (bVal - mean[2]) / std[2]
  171. rVals = append(rVals, rVal)
  172. gVals = append(gVals, gVal)
  173. bVals = append(bVals, bVal)
  174. }
  175. }
  176. pixelVals = append(pixelVals, rVals...)
  177. pixelVals = append(pixelVals, gVals...)
  178. pixelVals = append(pixelVals, bVals...)
  179. }
  180. return pixelVals
  181. }
  182. func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
  183. outputSize := image.Point{p.imageSize, p.imageSize}
  184. // clip values
  185. mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
  186. std := [3]float32{0.26862954, 0.26130258, 0.27577711}
  187. newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
  188. newImage = p.pad(newImage, outputSize, aspectRatio)
  189. data := p.pack(newImage, aspectRatio, mean, std)
  190. aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
  191. return data, aspectRatioIndex, nil
  192. }