imageproc.go 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. package mistral3
  2. import (
  3. "fmt"
  4. "image"
  5. _ "image/jpeg"
  6. _ "image/png"
  7. "io"
  8. "math"
  9. "github.com/ollama/ollama/ml"
  10. "github.com/ollama/ollama/model/imageproc"
  11. )
  12. func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
  13. return image.Point{
  14. (imageSize.X-1)/patchSize.X + 1,
  15. (imageSize.Y-1)/patchSize.Y + 1,
  16. }
  17. }
  18. func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
  19. b := img.Bounds()
  20. ratio := math.Max(float64(b.Max.Y)/float64(longestEdge), float64(b.Max.X)/float64(longestEdge))
  21. newSize := img.Bounds().Max
  22. if ratio > 1.0 {
  23. newSize = image.Point{
  24. int(math.Floor(float64(b.Max.X) / ratio)),
  25. int(math.Floor(float64(b.Max.Y) / ratio)),
  26. }
  27. }
  28. tokens := getNumImageTokens(newSize, patchSize)
  29. return image.Point{
  30. tokens.X * patchSize.X,
  31. tokens.Y * patchSize.Y,
  32. }
  33. }
  34. func resizeImage(img image.Image, format string, longestEdge int, patchSize image.Point) image.Image {
  35. if format == "png" {
  36. img = imageproc.Composite(img)
  37. }
  38. newSize := getResizeOutputImageSize(img, longestEdge, patchSize)
  39. // todo should be ResizeBicubic, but it doesn't exist
  40. return imageproc.Resize(img, newSize, imageproc.ResizeBilinear)
  41. }
  42. func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
  43. img, format, err := image.Decode(imageData)
  44. if err != nil {
  45. return nil, nil, fmt.Errorf("failed to decode image: %w", err)
  46. }
  47. longestEdge := 1024
  48. patchSize := image.Point{16, 16}
  49. img = resizeImage(img, format, longestEdge, patchSize)
  50. data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
  51. opts := map[string]any{}
  52. return data, opts, nil
  53. }
  54. type ImageProcessor struct {
  55. imageSize int
  56. patchSize int
  57. numChannels int
  58. longestEdge int
  59. }
  60. func newImageProcessor(c ml.Config) ImageProcessor {
  61. return ImageProcessor{
  62. imageSize: int(c.Uint("vision.image_size", 1540)),
  63. patchSize: int(c.Uint("vision.patch_size", 14)),
  64. numChannels: int(c.Uint("vision.num_channels", 3)),
  65. longestEdge: int(c.Uint("vision.longest_edge", 1540)),
  66. }
  67. }
  68. func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
  69. outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
  70. newImage := imageproc.Composite(img)
  71. newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
  72. data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
  73. return data, nil
  74. }