imageproc.go 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. package qwen2vl
  2. import (
  3. "fmt"
  4. "image"
  5. _ "image/jpeg"
  6. _ "image/png"
  7. "io"
  8. "math"
  9. "github.com/ollama/ollama/model/imageproc"
  10. )
  11. const (
  12. DefaultFactor = 28
  13. DefaultMinPixels = 56 * 56
  14. DefaultMaxPixels = 14 * 14 * 4 * 1280
  15. )
  16. // smartResize calculates the size of the image to resize to based on the
  17. // factor, minPixels, and maxPixels.
  18. func smartResize(size image.Point, factor, minPixels, maxPixels int) image.Point {
  19. // 1. Both dimensions of size are divisible by factor
  20. // 2. The area of the image is between minPixels and maxPixels
  21. // 3. The aspect ratio of the image is as close to 1:1 as possible
  22. if size.Y < factor || size.X < factor {
  23. panic("image is too small to resize")
  24. } else if max(size.X, size.Y)/min(size.X, size.Y) > 200 {
  25. panic("aspect ratio must be less than 200:1")
  26. }
  27. f := float64(factor)
  28. width := float64(size.X)
  29. height := float64(size.Y)
  30. xBar := math.Round(width/f) * f
  31. yBar := math.Round(height/f) * f
  32. if xBar*yBar > float64(maxPixels) {
  33. beta := math.Sqrt(height * width / float64(maxPixels))
  34. xBar = math.Floor(width/beta/f) * f
  35. yBar = math.Floor(height/beta/f) * f
  36. } else if xBar*yBar < float64(minPixels) {
  37. beta := math.Sqrt(float64(minPixels) / (height * width))
  38. xBar = math.Ceil(width*beta/f) * f
  39. yBar = math.Ceil(height*beta/f) * f
  40. }
  41. return image.Point{int(xBar), int(yBar)}
  42. }
  43. func resizeImage(img image.Image, format string, size image.Point) image.Image {
  44. if format == "png" {
  45. img = imageproc.Composite(img)
  46. }
  47. return imageproc.Resize(img, size, imageproc.ResizeBilinear)
  48. }
  49. func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
  50. img, format, err := image.Decode(imageData)
  51. if err != nil {
  52. return nil, nil, fmt.Errorf("failed to decode image: %w", err)
  53. }
  54. size := smartResize(img.Bounds().Max, DefaultFactor, DefaultMinPixels, DefaultMaxPixels)
  55. img = resizeImage(img, format, size)
  56. data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
  57. opts := map[string]any{}
  58. return data, opts, nil
  59. }