imageproc.go 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. package pixtral
  2. import (
  3. "fmt"
  4. "image"
  5. _ "image/jpeg"
  6. _ "image/png"
  7. "io"
  8. "math"
  9. "github.com/ollama/ollama/model/imageproc"
  10. )
  11. func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
  12. return image.Point{
  13. (imageSize.X-1)/patchSize.X + 1,
  14. (imageSize.Y-1)/patchSize.Y + 1,
  15. }
  16. }
  17. func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
  18. b := img.Bounds()
  19. le := float64(longestEdge)
  20. ratio := math.Max(float64(b.Max.Y)/le, float64(b.Max.X)/le)
  21. newSize := img.Bounds().Max
  22. if ratio > 1.0 {
  23. newSize = image.Point{
  24. int(math.Ceil(float64(b.Max.X) / ratio)),
  25. int(math.Ceil(float64(b.Max.Y) / ratio)),
  26. }
  27. }
  28. tokens := getNumImageTokens(newSize, patchSize)
  29. return image.Point{
  30. tokens.X * patchSize.X,
  31. tokens.Y * patchSize.Y,
  32. }
  33. }
  34. func resizeImage(img image.Image, format string, longestEdge int, patchSize image.Point) image.Image {
  35. if format == "png" {
  36. img = imageproc.Composite(img)
  37. }
  38. newSize := getResizeOutputImageSize(img, longestEdge, patchSize)
  39. // todo should be ResizeBicubic, but it doesn't exist
  40. return imageproc.Resize(img, newSize, imageproc.ResizeBilinear)
  41. }
  42. func Preprocess(imageData io.Reader) ([]float32, map[string]any, error) {
  43. img, format, err := image.Decode(imageData)
  44. if err != nil {
  45. return nil, nil, fmt.Errorf("failed to decode image: %w", err)
  46. }
  47. longestEdge := 1024
  48. patchSize := image.Point{16, 16}
  49. img = resizeImage(img, format, longestEdge, patchSize)
  50. data := imageproc.Normalize(img, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
  51. opts := map[string]any{}
  52. return data, opts, nil
  53. }