jmorganca 1 month ago
parent
commit
8dd2a81f8c

+ 19 - 48
convert/convert_mistral.go

@@ -116,13 +116,16 @@ func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
 
 
 func (p *mistral3Model) Replacements() []string {
 	return []string{
-		// Text model replacements
-		"model.layers", "blk",
+		"language_model.model.norm", "output_norm",
+		"language_model.model.", "",
+		"language_model.", "",
+		"layers", "blk",
+		"transformer.layers", "blk",
+		"vision_tower", "v",
+		"ln_pre", "encoder_norm",
 		"input_layernorm", "attn_norm",
 		"input_layernorm", "attn_norm",
 		"post_attention_layernorm", "ffn_norm",
 		"post_attention_layernorm", "ffn_norm",
-		"lm_head", "output",
-		"model.embed_tokens.weight", "token_embd.weight",
-		"model.norm.weight", "output_norm.weight",
+		"embed_tokens", "token_embd",
 		"self_attn.q_proj", "attn_q",
 		"self_attn.q_proj", "attn_q",
 		"self_attn.k_proj", "attn_k",
 		"self_attn.k_proj", "attn_k",
 		"self_attn.v_proj", "attn_v",
 		"self_attn.v_proj", "attn_v",
@@ -130,50 +133,18 @@ func (p *mistral3Model) Replacements() []string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.down_proj", "ffn_down",
 		"mlp.gate_proj", "ffn_gate",
 		"mlp.gate_proj", "ffn_gate",
 		"mlp.up_proj", "ffn_up",
 		"mlp.up_proj", "ffn_up",
-
-		// Language model replacements
-		"language_model.model.embed_tokens", "token_embd",
-		"language_model.model.layers", "blk",
-		"language_model.model.layers.*.input_layernorm", "attn_norm",
-		"language_model.model.layers.*.self_attn.q_proj", "attn_q",
-		"language_model.model.layers.*.self_attn.k_proj", "attn_k",
-		"language_model.model.layers.*.self_attn.v_proj", "attn_v",
-		"language_model.model.layers.*.self_attn.o_proj", "attn_output",
-		"language_model.model.layers.*.mlp.gate_proj", "ffn_gate",
-		"language_model.model.layers.*.mlp.down_proj", "ffn_down",
-		"language_model.model.layers.*.mlp.up_proj", "ffn_up",
-		"language_model.model.layers.*.post_attention_layernorm", "ffn_norm",
-		"language_model.lm_head", "output",
-		"language_model.model.norm", "output_norm",
-
-		// Vision model replacements - map to shorter prefixes
-		"vision_tower", "v",
+		"attention.q_proj", "attn_q",
+		"attention.k_proj", "attn_k",
+		"attention.v_proj", "attn_v",
+		"attention.o_proj", "attn_output",
+		"attention_norm", "attn_norm",
+		"feed_forward", "mlp",
+		"feed_forward.gate_proj", "ffn_gate",
+		"feed_forward.down_proj", "ffn_down",
+		"feed_forward.up_proj", "ffn_up",
 		"multi_modal_projector", "mm",
 		"multi_modal_projector", "mm",
-
-		// Vision transformer blocks - these should be updated accordingly
-		"vision_tower.transformer.layers", "v.blk",
-		"vision_tower.transformer.layers.*.attention_norm", "v.attn_norm",
-		"vision_tower.transformer.layers.*.attention.q_proj", "v.attn_q",
-		"vision_tower.transformer.layers.*.attention.k_proj", "v.attn_k",
-		"vision_tower.transformer.layers.*.attention.v_proj", "v.attn_v",
-		"vision_tower.transformer.layers.*.attention.o_proj", "v.attn_output",
-		"vision_tower.transformer.layers.*.feed_forward.gate_proj", "v.ffn_gate",
-		"vision_tower.transformer.layers.*.feed_forward.down_proj", "v.ffn_down",
-		"vision_tower.transformer.layers.*.feed_forward.up_proj", "v.ffn_up",
-		"vision_tower.transformer.layers.*.ffn_norm", "v.ffn_norm",
-		"vision_tower.ln_pre", "v.encoder_norm",
-		"vision_tower.patch_conv", "v.patch_conv",
-		"vision_tower.embeddings", "v.embeddings",
-
-		// Alternative vision model paths
-		"vision_model.vision_model.embeddings", "v.embeddings",
-		"vision_model.vision_model", "v",
-		"vision_model.layers", "v.blk",
-
-		// Multimodal projector components
-		"multi_modal_projector.patch_merger", "mm.patch_merger",
-		"multi_modal_projector.norm", "mm.norm",
-		"multi_modal_projector.linear", "mm.projection",
+		"ffn_norm", "ffn_norm",
+		"lm_head", "output",
 	}
 }
 
 

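Note on the new Replacements list: the entries are ordered old/new substring pairs, and order matters where prefixes overlap. A minimal sketch of how such a list behaves, assuming the converter applies it with strings.NewReplacer (which, at each position, uses the first pair that matches — hence the specific "language_model.model.norm" entry before the catch-all "language_model.model." prefix):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Illustrative subset of the replacement pairs above.
	r := strings.NewReplacer(
		"language_model.model.norm", "output_norm",
		"language_model.model.", "",
		"layers", "blk",
		"self_attn.q_proj", "attn_q",
		"embed_tokens", "token_embd",
	)
	for _, name := range []string{
		"language_model.model.layers.0.self_attn.q_proj.weight",
		"language_model.model.norm.weight",
		"language_model.model.embed_tokens.weight",
	} {
		fmt.Println(name, "->", r.Replace(name))
	}
	// -> blk.0.attn_q.weight
	// -> output_norm.weight
	// -> token_embd.weight
}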
+ 3 - 0
ml/backend.go

@@ -144,6 +144,9 @@ type Tensor interface {
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

 	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
+	RoPEMulti(ctx Context, positionIDs, ropeFactors Tensor, ropeDim uint32, sections [4]int, ropeType uint32, base, scale float32) Tensor
+
+	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
 
 
 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor

+ 35 - 0
ml/backend/ggml/ggml.go

@@ -958,6 +958,41 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 	}
 }
 
 
+func (t *Tensor) RoPEMulti(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, sections [4]int, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
+	if ropeFactors == nil {
+		ropeFactors = &Tensor{b: t.b}
+	}
+
+	dequant := t.t
+	if C.ggml_is_quantized(t.t._type) {
+		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
+	}
+
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_rope_multi(
+			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
+			C.int(ropeDim),
+			(*C.int)(unsafe.Pointer(&sections[0])),
+			C.int(ropeType),
+			131072, // YaRN n_ctx_train
+			C.float(ropeBase),
+			C.float(ropeScale),
+			0.,  // YaRN ext_factor
+			1.,  // YaRN attn_factor
+			32., // YaRN beta_fast
+			1.,  // YaRN beta_slow
+		),
+	}
+}
+
+func (t *Tensor) IM2Col(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_im2col(ctx.(*Context).ctx, t.t, weight.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
+	}
+}
+
 func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
 	return &Tensor{
 		b: t.b,

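The new IM2Col wrapper is what the 2x2 patch merge added in model_vision.go below builds on. As a shape sanity check (a standalone sketch, not part of the change; the embedding size of 1024 is only an illustrative value), ggml_im2col with is_2D=true yields [IC*KH*KW, OW, OH, N] using the usual convolution output-size rule:

package main

import "fmt"

// convOut mirrors the conv output-size rule: (in + 2*p - d*(k-1) - 1)/s + 1.
func convOut(in, k, s, p, d int) int {
	return (in+2*p-d*(k-1)-1)/s + 1
}

func main() {
	d := 1024                     // illustrative embedding size per patch
	ow := convOut(74, 2, 2, 0, 1) // 74x110 patch grid, 2x2 kernel, stride 2
	oh := convOut(110, 2, 2, 0, 1)
	fmt.Println(d*2*2, ow, oh) // 4096 37 55
	// PatchMerger.Forward below then reshapes this to [4*d, 37*55] = [4096, 2035]
	// before the merging_layer projection.
}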
+ 4 - 0
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m

@@ -2186,6 +2186,10 @@ static void ggml_metal_encode_node(
             } break;
         case GGML_OP_MUL_MAT:
             {
+                if (ne00 != ne10) {
+                    printf("mul_mat, ne00: %d, ne01: %d, ne02: %d, ne03: %d, ne10: %d, ne11: %d, ne12: %d, ne13: %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13);
+                }
+
                 GGML_ASSERT(ne00 == ne10);

                 GGML_ASSERT(ne12 % ne02 == 0);

+ 2 - 6
model/models/mistral3/imageproc.go

@@ -21,8 +21,7 @@ func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
 
 
 func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
 	b := img.Bounds()
-	le := float64(longestEdge)
-	ratio := math.Max(float64(b.Max.Y)/le, float64(b.Max.X)/le)
+	ratio := math.Max(float64(b.Max.Y)/float64(longestEdge), float64(b.Max.X)/float64(longestEdge))
 
 
 	newSize := img.Bounds().Max
 
 
@@ -80,17 +79,14 @@ func newImageProcessor(c ml.Config) ImageProcessor {
 		imageSize:   int(c.Uint("vision.image_size", 1540)),
 		patchSize:   int(c.Uint("vision.patch_size", 14)),
 		numChannels: int(c.Uint("vision.num_channels", 3)),
-		longestEdge: int(c.Uint("vision.longest_edge", 1024)),
+		longestEdge: int(c.Uint("vision.longest_edge", 1540)),
 	}
 }
 
 
 func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
 	outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
-
 	newImage := imageproc.Composite(img)
 	newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
-
 	data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
-
 	return data, nil
 }

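For intuition on the inlined ratio above, a small worked example; only the ratio line is shown in this hunk, so the final scaling step here is an assumption about the rest of getResizeOutputImageSize:

package main

import (
	"fmt"
	"math"
)

func main() {
	// Hypothetical 3000x2000 input against the new longest_edge default of 1540.
	longestEdge := 1540.0
	w, h := 3000.0, 2000.0
	ratio := math.Max(h/longestEdge, w/longestEdge)
	fmt.Printf("ratio=%.3f\n", ratio) // ratio=1.948
	// Presumably both edges are then divided by this ratio (and later snapped to
	// patch multiples) so the longest edge fits within 1540:
	fmt.Printf("%.0f x %.0f\n", w/ratio, h/ratio) // 1540 x 1027
}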
+ 17 - 8
model/models/mistral3/model.go

@@ -2,6 +2,7 @@ package mistral3
 
 
 import (
 	"bytes"
+	"fmt"
 	"image"
 	"image"
 	"slices"
 	"slices"
 
 
@@ -59,19 +60,28 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	// Create tensor from image data
 	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
 		m.ImageProcessor.imageSize,
-		m.ImageProcessor.imageSize,
+
+		// TODO (jmorganca): this should be returned from the
+		// image processor instead of hardcoded
+		1036,
 		m.ImageProcessor.numChannels,
 	)
 	if err != nil {
 		return nil, err
 	}
 
 
+	fmt.Println("pixelValues", "shape", pixelValues.Shape(), "data", ml.Dump(ctx, pixelValues))
+
 	// Forward pass through vision model
 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 
 
+	// fmt.Println("visionOutputs", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+
 	// Project to text embedding space
 	visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.VisionModel.eps)
 
 
+	// fmt.Println("visionOutputs after projector", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+
 	return visionOutputs, nil
 }
 
 
@@ -85,16 +95,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 			inputMultimodal := inp.Multimodal.(ml.Tensor)
 
 
 			// Add special image tokens - using the imageTokenIndex from config
-			result = append(result,
-				input.Input{Token: int32(m.MultiModalProjector.imageTokenIndex)},             // Image token
-				input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // Image data
-			)
-
-			// Add image token placeholders
-			result = append(result, slices.Repeat([]input.Input{{Token: 0}}, inputMultimodal.Dim(1)-1)...)
+			result = append(result, input.Input{Token: 10})                                                       // [IMG]
+			result = append(result, input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}) // image data
+			result = append(result, slices.Repeat([]input.Input{{Token: 10}}, inputMultimodal.Dim(1)-1)...)       // [IMG] placeholders
+			result = append(result, input.Input{Token: 13})                                                       // [IMG_END]
 		}
 	}
 
 
+	fmt.Println("post tokenize", "result", result)
+
 	return result, nil
 }
 
 

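A worked illustration of the sequence PostTokenize now emits per image, using a local stand-in for input.Input and N = 4 projected image tokens for brevity: one leading [IMG] token, one token-less entry that carries the image tensor, N-1 [IMG] placeholders, then [IMG_END]:

package main

import (
	"fmt"
	"slices"
)

// Input stands in for input.Input with only the fields used here.
type Input struct {
	Token      int32
	Multimodal any
}

func main() {
	n := 4 // pretend the projected image tensor has Dim(1) == 4
	var result []Input
	result = append(result, Input{Token: 10})                            // [IMG]
	result = append(result, Input{Multimodal: "<image tensor>"})         // embedding rides on this entry
	result = append(result, slices.Repeat([]Input{{Token: 10}}, n-1)...) // [IMG] placeholders
	result = append(result, Input{Token: 13})                            // [IMG_END]
	fmt.Println(result)
	// [{10 <nil>} {0 <image tensor>} {10 <nil>} {10 <nil>} {10 <nil>} {13 <nil>}]
}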
+ 116 - 25
model/models/mistral3/model_vision.go

@@ -1,6 +1,7 @@
 package mistral3
 
 
 import (
+	"fmt"
 	"math"
 	"math"
 
 
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml"
@@ -9,31 +10,109 @@ import (
 
 
 var batchSize int = 1
 
 
+type PatchMerger struct {
+	MergingLayer *nn.Linear `gguf:"merging_layer"`
+}
+
+func (pm *PatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor {
+	// TODO: pass these in
+	w := 110
+	h := 74
+	// tokensPerImage := w * h
+	d := visionOutputs.Dim(0)
+
+	// TODO: handle multiple images, this currently assumes one
+	fmt.Println("patchmerger visionOutputs", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+
+	// Reshape to [h, w, hidden_size]
+	imageGrid := visionOutputs.Reshape(ctx, h, w, d)
+	fmt.Println("imageGrid", "shape", imageGrid.Shape(), "data", ml.Dump(ctx, imageGrid))
+
+	// TODO: load from ml.Config
+	spatialMergeSize := 2
+	kernel := ctx.Output().Empty(ml.DTypeF32, spatialMergeSize, spatialMergeSize, d, 1)
+	fmt.Println("kernel", "shape", kernel.Shape(), "data", ml.Dump(ctx, kernel))
+
+	patches := kernel.IM2Col(ctx, imageGrid, spatialMergeSize, spatialMergeSize, 0, 0, 1, 1)
+	fmt.Println("patches", "shape", patches.Shape(), "data", ml.Dump(ctx, patches))
+
+	fmt.Println("creating reshaped", d*spatialMergeSize*spatialMergeSize, "x", patches.Dim(1)*patches.Dim(2))
+	reshaped := patches.Reshape(ctx, d*spatialMergeSize*spatialMergeSize, patches.Dim(1)*patches.Dim(2))
+	fmt.Println("reshaped", "shape", reshaped.Shape(), "data", ml.Dump(ctx, reshaped))
+
+	return pm.MergingLayer.Forward(ctx, reshaped)
+}
+
+type MultiModalProjector struct {
+	Norm        *nn.RMSNorm  `gguf:"norm"`
+	Linear1     *nn.Linear   `gguf:"linear_1"`
+	Linear2     *nn.Linear   `gguf:"linear_2"`
+	PatchMerger *PatchMerger `gguf:"patch_merger"`
+
+	spatialMergeSize int
+	imageTokenIndex  int
+	hasBias          bool
+}
+
+func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor {
+	visionOutputs = p.Norm.Forward(ctx, visionOutputs, eps)
+	fmt.Println("visionOutputs after norm", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+	visionOutputs = p.PatchMerger.Forward(ctx, visionOutputs)
+	fmt.Println("visionOutputs after patch merger", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+	visionOutputs = p.Linear1.Forward(ctx, visionOutputs).GELU(ctx)
+	fmt.Println("visionOutputs after linear1 and gelu", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+	return p.Linear2.Forward(ctx, visionOutputs)
+}
+
+func newMultiModalProjector(c ml.Config) *MultiModalProjector {
+	return &MultiModalProjector{
+		spatialMergeSize: int(c.Uint("spatial_merge_size", 2)),
+		imageTokenIndex:  int(c.Uint("image_token_index", 10)),
+		hasBias:          c.Bool("mm.projector_bias", false),
+	}
+}
+
 type VisionSelfAttention struct {
-	Query       *nn.Linear `gguf:"attn_q"`
-	Key         *nn.Linear `gguf:"attn_k"`
-	Value       *nn.Linear `gguf:"attn_v"`
-	Output      *nn.Linear `gguf:"attn_output"`
-	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
 }
 
 
 func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	headDim := opts.headDim
 
 
+	// fmt.Println("sa.Query", "shape", sa.Query.Weight.Shape(), "data", ml.Dump(ctx, sa.Query.Weight))
+
 	query := sa.Query.Forward(ctx, hiddenState)
 	key := sa.Key.Forward(ctx, hiddenState)
 	value := sa.Value.Forward(ctx, hiddenState)
 
 
-	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	key = key.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	value = value.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	// fmt.Println("query", "shape", query.Shape(), "data", ml.Dump(ctx, query))
+	// fmt.Println("key", "shape", key.Shape(), "data", ml.Dump(ctx, key))
+	// fmt.Println("value", "shape", value.Shape(), "data", ml.Dump(ctx, value))
 
 
-	ropeType := uint32(0)
-	query = query.RoPE(ctx, positionIDs, sa.RopeFactors, uint32(headDim), ropeType, opts.ropeBase, opts.ropeScale)
-	key = key.RoPE(ctx, positionIDs, sa.RopeFactors, uint32(headDim), ropeType, opts.ropeBase, opts.ropeScale)
+	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
+	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
+	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
+
+	// fmt.Println("query permute", "shape", query.Shape(), "data", ml.Dump(ctx, query))
+	// fmt.Println("key permute", "shape", key.Shape(), "data", ml.Dump(ctx, key))
+	// fmt.Println("value permute", "shape", value.Shape(), "data", ml.Dump(ctx, value))
+	// fmt.Println("positionIDs", "shape", positionIDs.Shape(), "data", ml.Dump(ctx, positionIDs))
+
+	// Multimodal rope
+	ropeType := uint32(24)
+	query = query.RoPEMulti(ctx, positionIDs, nil, uint32(headDim/2), [4]int{0, headDim / 2, headDim / 2, 0}, ropeType, opts.ropeBase, opts.ropeScale)
+	key = key.RoPEMulti(ctx, positionIDs, nil, uint32(headDim/2), [4]int{0, headDim / 2, headDim / 2, 0}, ropeType, opts.ropeBase, opts.ropeScale)
+
+	// fmt.Println("query rope", "shape", query.Shape(), "data", ml.Dump(ctx, query))
+	// fmt.Println("key rope", "shape", key.Shape(), "data", ml.Dump(ctx, key))
 
 
 	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), nil)
 	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), nil)
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
 
 
 	return sa.Output.Forward(ctx, attention)
 	return sa.Output.Forward(ctx, attention)
 }
 	SelfAttention *VisionSelfAttention
 	SelfAttention *VisionSelfAttention
 
 	FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
 	FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP     *VisionMLP
 }
 }
 
 func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 
 
 	// self attention
 	// self attention
 	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, positionIDs, opts)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, positionIDs, opts)
 	hiddenState = hiddenState.Add(ctx, residual)
 	residual = hiddenState
@@ -87,25 +167,36 @@ type VisionModelOptions struct {
 
 
 type VisionModel struct {
 	PatchEmbedding *nn.Conv2D           `gguf:"patch_conv"`
-	EncoderNorm    *nn.LayerNorm        `gguf:"encoder_norm"`
+	EncoderNorm    *nn.RMSNorm          `gguf:"encoder_norm"`
 	Layers         []VisionEncoderLayer `gguf:"blk"`
 
 
 	*VisionModelOptions
 }
 
 
 func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
-	numPatchesH := m.imageSize / m.patchSize
-	numPatchesW := m.imageSize / m.patchSize
+	numPatchesH := pixelValues.Dim(1) / m.patchSize
+	numPatchesW := pixelValues.Dim(0) / m.patchSize
 	numPatches := numPatchesH * numPatchesW
-
 	hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
+	// fmt.Println("after patch embedding", "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState))
 	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
+	// fmt.Println("after reshape", "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState))
 	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
-
-	// Create position IDs
-	positions := make([]int32, numPatches)
-	for i := range positions {
-		positions[i] = int32(i)
+	// fmt.Println("after permute", "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState))
+
+	// TODO: this seems to have incorrect output?
+	hiddenState = m.EncoderNorm.Forward(ctx, hiddenState, m.VisionModelOptions.eps)
+	// fmt.Println("after norm", "eps", m.VisionModelOptions.eps, "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState, ml.DumpOptions{Items: 3, Precision: 6}))
+
+	// Generate 4D position IDs (time, height, width, extra) for MROPE
+	var positions []int32
+	for h := 0; h < numPatchesH; h++ {
+		for w := 0; w < numPatchesW; w++ {
+			positions = append(positions, 0)        // unused
+			positions = append(positions, int32(h)) // height
+			positions = append(positions, int32(w)) // width
+			positions = append(positions, 0)        // unused
+		}
 	}
 
 
 	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
@@ -113,14 +204,14 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
 		panic(err)
 	}
 
 
-	// Apply encoder normalization
-	hiddenState = m.EncoderNorm.Forward(ctx, hiddenState, m.eps)
+	// fmt.Println("positionIDs", "shape", positionIDs.Shape(), "data", ml.Dump(ctx, positionIDs))
 
 
-	// Process through transformer layers
 	for _, layer := range m.Layers {
 		hiddenState = layer.Forward(ctx, hiddenState, positionIDs, m.VisionModelOptions)
 	}
 
 
+	// fmt.Println("after layers", "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState))
+
 	return hiddenState
 }
 
 
@@ -135,7 +226,7 @@ func newVisionModel(c ml.Config) *VisionModel {
 			imageSize:        int(c.Uint("vision.image_size", 1540)),
 			patchSize:        int(c.Uint("vision.patch_size", 14)),
 			numChannels:      int(c.Uint("vision.num_channels", 3)),
-			eps:              c.Float("vision.attention.layer_norm_epsilon", 1e-05),
+			eps:              c.Float("vision.attention.layer_norm_epsilon", 1e-5),
 			ropeBase:         c.Float("vision.rope.freq_base", 10000.0),
 			ropeScale:        c.Float("vision.rope.freq_scale", 1.0),
 		},

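The hardcoded values scattered through this commit (1540x1036 pixels in EncodeMultimodal, patch size 14, w=110/h=74 in PatchMerger.Forward, spatial merge size 2) are mutually consistent; a quick arithmetic check:

package main

import "fmt"

func main() {
	imageW, imageH := 1540, 1036 // tensor dimensions from EncodeMultimodal
	patch, merge := 14, 2        // vision.patch_size, spatial_merge_size

	pw, ph := imageW/patch, imageH/patch
	fmt.Println(pw, ph, pw*ph) // 110 74 8140 -> the w/h hardcoded in PatchMerger.Forward

	mw, mh := pw/merge, ph/merge
	fmt.Println(mw, mh, mw*mh) // 55 37 2035 -> tokens after the 2x2 merge; per the shapes
	// above, this is the Dim(1) that PostTokenize uses for the [IMG] placeholder count
}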
+ 0 - 38
model/models/mistral3/multimodal_proj.go

@@ -1,38 +0,0 @@
-package mistral3
-
-import (
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-)
-
-type MultiModalProjector struct {
-	Norm       *nn.RMSNorm `gguf:"norm"`
-	Projection *nn.Linear  `gguf:"projection"`
-
-	spatialMergeSize int
-	imageTokenIndex  int
-	hasBias          bool
-}
-
-func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor {
-	// Apply normalization
-	visionOutputs = p.Norm.Forward(ctx, visionOutputs, eps)
-
-	// If the spatial merge size is > 1, average pool the patches
-	if p.spatialMergeSize > 1 {
-		// Implementation depends on how the model handles spatial merging
-		// For simplicity, we'll use a spatial pooling approach
-		visionOutputs = visionOutputs.AvgPool2D(ctx, p.spatialMergeSize, p.spatialMergeSize, 0)
-	}
-
-	// Project to text embedding dimension
-	return p.Projection.Forward(ctx, visionOutputs)
-}
-
-func newMultiModalProjector(c ml.Config) *MultiModalProjector {
-	return &MultiModalProjector{
-		spatialMergeSize: int(c.Uint("spatial_merge_size", 2)),
-		imageTokenIndex:  int(c.Uint("image_token_index", 10)),
-		hasBias:          c.Bool("mm.projector_bias", false),
-	}
-}