imageproc.go 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. package mistral3
  2. import (
  3. "fmt"
  4. "image"
  5. _ "image/jpeg"
  6. _ "image/png"
  7. "io"
  8. "math"
  9. "github.com/ollama/ollama/ml"
  10. "github.com/ollama/ollama/model/imageproc"
  11. )
  12. func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
  13. return image.Point{
  14. (imageSize.X-1)/patchSize.X + 1,
  15. (imageSize.Y-1)/patchSize.Y + 1,
  16. }
  17. }
  18. func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
  19. b := img.Bounds()
  20. le := float64(longestEdge)
  21. ratio := math.Max(float64(b.Max.Y)/le, float64(b.Max.X)/le)
  22. newSize := img.Bounds().Max
  23. if ratio > 1.0 {
  24. newSize = image.Point{
  25. int(math.Floor(float64(b.Max.X) / ratio)),
  26. int(math.Floor(float64(b.Max.Y) / ratio)),
  27. }
  28. }
  29. tokens := getNumImageTokens(newSize, patchSize)
  30. return image.Point{
  31. tokens.X * patchSize.X,
  32. tokens.Y * patchSize.Y,
  33. }
  34. }
  35. func resizeImage(img image.Image, format string, longestEdge int, patchSize image.Point) image.Image {
  36. if format == "png" {
  37. img = imageproc.Composite(img)
  38. }
  39. newSize := getResizeOutputImageSize(img, longestEdge, patchSize)
  40. // todo should be ResizeBicubic, but it doesn't exist
  41. return imageproc.Resize(img, newSize, imageproc.ResizeBilinear)
  42. }
  43. func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
  44. img, format, err := image.Decode(imageData)
  45. if err != nil {
  46. return nil, nil, fmt.Errorf("failed to decode image: %w", err)
  47. }
  48. longestEdge := 1024
  49. patchSize := image.Point{16, 16}
  50. img = resizeImage(img, format, longestEdge, patchSize)
  51. data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
  52. opts := map[string]any{}
  53. return data, opts, nil
  54. }
  55. type ImageProcessor struct {
  56. imageSize int
  57. patchSize int
  58. numChannels int
  59. longestEdge int
  60. }
  61. func newImageProcessor(c ml.Config) ImageProcessor {
  62. return ImageProcessor{
  63. imageSize: int(c.Uint("vision.image_size", 1540)),
  64. patchSize: int(c.Uint("vision.patch_size", 14)),
  65. numChannels: int(c.Uint("vision.num_channels", 3)),
  66. longestEdge: int(c.Uint("vision.longest_edge", 1024)),
  67. }
  68. }
  69. func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
  70. outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
  71. newImage := imageproc.Composite(img)
  72. newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
  73. data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
  74. return data, nil
  75. }