process_image.go 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. package gemma3
  2. import (
  3. "image"
  4. "github.com/ollama/ollama/ml"
  5. "github.com/ollama/ollama/model/imageproc"
  6. )
  7. type ImageProcessor struct {
  8. imageSize, numChannels int
  9. }
  10. func newImageProcessor(c ml.Config) ImageProcessor {
  11. return ImageProcessor{
  12. imageSize: int(c.Uint("vision.image_size")),
  13. numChannels: int(c.Uint("vision.num_channels")),
  14. }
  15. }
  16. func (p *ImageProcessor) pack(img image.Image, mean, std [3]float32) []float32 {
  17. var pixelVals []float32
  18. bounds := img.Bounds()
  19. var rVals, gVals, bVals []float32
  20. for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
  21. for x := bounds.Min.X; x < bounds.Max.X; x++ {
  22. c := img.At(x, y)
  23. r, g, b, _ := c.RGBA()
  24. rVal := float32(r>>8) / 255.0
  25. gVal := float32(g>>8) / 255.0
  26. bVal := float32(b>>8) / 255.0
  27. rVal = (rVal - mean[0]) / std[0]
  28. gVal = (gVal - mean[1]) / std[1]
  29. bVal = (bVal - mean[2]) / std[2]
  30. rVals = append(rVals, rVal)
  31. gVals = append(gVals, gVal)
  32. bVals = append(bVals, bVal)
  33. }
  34. }
  35. pixelVals = append(pixelVals, rVals...)
  36. pixelVals = append(pixelVals, gVals...)
  37. pixelVals = append(pixelVals, bVals...)
  38. return pixelVals
  39. }
  40. func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
  41. outputSize := image.Point{p.imageSize, p.imageSize}
  42. newImage := imageproc.Composite(img)
  43. newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
  44. data := p.pack(newImage, imageproc.ImageNetStandardMean, imageproc.ImageNetStandardSTD)
  45. return data, nil
  46. }