jmorganca 1 month ago
parent
commit
8dd2a81f8c

+ 19 - 48
convert/convert_mistral.go

@@ -116,13 +116,16 @@ func (p *mistral3Model) Tensors(ts []Tensor) []ggml.Tensor {
 
 
 func (p *mistral3Model) Replacements() []string {
 	return []string{
-		// Text model replacements
-		"model.layers", "blk",
+		"language_model.model.norm", "output_norm",
+		"language_model.model.", "",
+		"language_model.", "",
+		"layers", "blk",
+		"transformer.layers", "blk",
+		"vision_tower", "v",
+		"ln_pre", "encoder_norm",
 		"input_layernorm", "attn_norm",
 		"input_layernorm", "attn_norm",
 		"post_attention_layernorm", "ffn_norm",
 		"post_attention_layernorm", "ffn_norm",
-		"lm_head", "output",
-		"model.embed_tokens.weight", "token_embd.weight",
-		"model.norm.weight", "output_norm.weight",
+		"embed_tokens", "token_embd",
 		"self_attn.q_proj", "attn_q",
 		"self_attn.q_proj", "attn_q",
 		"self_attn.k_proj", "attn_k",
 		"self_attn.k_proj", "attn_k",
 		"self_attn.v_proj", "attn_v",
 		"self_attn.v_proj", "attn_v",
@@ -130,50 +133,18 @@ func (p *mistral3Model) Replacements() []string {
 		"mlp.down_proj", "ffn_down",
 		"mlp.down_proj", "ffn_down",
 		"mlp.gate_proj", "ffn_gate",
 		"mlp.gate_proj", "ffn_gate",
 		"mlp.up_proj", "ffn_up",
 		"mlp.up_proj", "ffn_up",
-
-		// Language model replacements
-		"language_model.model.embed_tokens", "token_embd",
-		"language_model.model.layers", "blk",
-		"language_model.model.layers.*.input_layernorm", "attn_norm",
-		"language_model.model.layers.*.self_attn.q_proj", "attn_q",
-		"language_model.model.layers.*.self_attn.k_proj", "attn_k",
-		"language_model.model.layers.*.self_attn.v_proj", "attn_v",
-		"language_model.model.layers.*.self_attn.o_proj", "attn_output",
-		"language_model.model.layers.*.mlp.gate_proj", "ffn_gate",
-		"language_model.model.layers.*.mlp.down_proj", "ffn_down",
-		"language_model.model.layers.*.mlp.up_proj", "ffn_up",
-		"language_model.model.layers.*.post_attention_layernorm", "ffn_norm",
-		"language_model.lm_head", "output",
-		"language_model.model.norm", "output_norm",
-
-		// Vision model replacements - map to shorter prefixes
-		"vision_tower", "v",
+		"attention.q_proj", "attn_q",
+		"attention.k_proj", "attn_k",
+		"attention.v_proj", "attn_v",
+		"attention.o_proj", "attn_output",
+		"attention_norm", "attn_norm",
+		"feed_forward", "mlp",
+		"feed_forward.gate_proj", "ffn_gate",
+		"feed_forward.down_proj", "ffn_down",
+		"feed_forward.up_proj", "ffn_up",
 		"multi_modal_projector", "mm",
 		"multi_modal_projector", "mm",
-
-		// Vision transformer blocks - these should be updated accordingly
-		"vision_tower.transformer.layers", "v.blk",
-		"vision_tower.transformer.layers.*.attention_norm", "v.attn_norm",
-		"vision_tower.transformer.layers.*.attention.q_proj", "v.attn_q",
-		"vision_tower.transformer.layers.*.attention.k_proj", "v.attn_k",
-		"vision_tower.transformer.layers.*.attention.v_proj", "v.attn_v",
-		"vision_tower.transformer.layers.*.attention.o_proj", "v.attn_output",
-		"vision_tower.transformer.layers.*.feed_forward.gate_proj", "v.ffn_gate",
-		"vision_tower.transformer.layers.*.feed_forward.down_proj", "v.ffn_down",
-		"vision_tower.transformer.layers.*.feed_forward.up_proj", "v.ffn_up",
-		"vision_tower.transformer.layers.*.ffn_norm", "v.ffn_norm",
-		"vision_tower.ln_pre", "v.encoder_norm",
-		"vision_tower.patch_conv", "v.patch_conv",
-		"vision_tower.embeddings", "v.embeddings",
-
-		// Alternative vision model paths
-		"vision_model.vision_model.embeddings", "v.embeddings",
-		"vision_model.vision_model", "v",
-		"vision_model.layers", "v.blk",
-
-		// Multimodal projector components
-		"multi_modal_projector.patch_merger", "mm.patch_merger",
-		"multi_modal_projector.norm", "mm.norm",
-		"multi_modal_projector.linear", "mm.projection",
+		"ffn_norm", "ffn_norm",
+		"lm_head", "output",
 	}
 }
 
 

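Note on the new Replacements list: the entries are ordered old/new substring pairs, and order matters where prefixes overlap. A minimal sketch of how such a list behaves, assuming the converter applies it with strings.NewReplacer (which, at each position, uses the first pair that matches — hence the specific "language_model.model.norm" entry before the catch-all "language_model.model." prefix):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Illustrative subset of the replacement pairs above.
	r := strings.NewReplacer(
		"language_model.model.norm", "output_norm",
		"language_model.model.", "",
		"layers", "blk",
		"self_attn.q_proj", "attn_q",
		"embed_tokens", "token_embd",
	)
	for _, name := range []string{
		"language_model.model.layers.0.self_attn.q_proj.weight",
		"language_model.model.norm.weight",
		"language_model.model.embed_tokens.weight",
	} {
		fmt.Println(name, "->", r.Replace(name))
	}
	// -> blk.0.attn_q.weight
	// -> output_norm.weight
	// -> token_embd.weight
}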
+ 3 - 0
ml/backend.go

@@ -144,6 +144,9 @@ type Tensor interface {
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

 	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32) Tensor
+	RoPEMulti(ctx Context, positionIDs, ropeFactors Tensor, ropeDim uint32, sections [4]int, ropeType uint32, base, scale float32) Tensor
+
+	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
 
 
 	Tanh(ctx Context) Tensor
 	GELU(ctx Context) Tensor

+ 35 - 0
ml/backend/ggml/ggml.go

@@ -958,6 +958,41 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 	}
 }
 
 
+func (t *Tensor) RoPEMulti(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, sections [4]int, ropeType uint32, ropeBase, ropeScale float32) ml.Tensor {
+	if ropeFactors == nil {
+		ropeFactors = &Tensor{b: t.b}
+	}
+
+	dequant := t.t
+	if C.ggml_is_quantized(t.t._type) {
+		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
+	}
+
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_rope_multi(
+			ctx.(*Context).ctx, dequant, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
+			C.int(ropeDim),
+			(*C.int)(unsafe.Pointer(&sections[0])),
+			C.int(ropeType),
+			131072, // YaRN n_ctx_train
+			C.float(ropeBase),
+			C.float(ropeScale),
+			0.,  // YaRN ext_factor
+			1.,  // YaRN attn_factor
+			32., // YaRN beta_fast
+			1.,  // YaRN beta_slow
+		),
+	}
+}
+
+func (t *Tensor) IM2Col(ctx ml.Context, weight ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_im2col(ctx.(*Context).ctx, t.t, weight.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1), true, C.GGML_TYPE_F32),
+	}
+}
+
 func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
 	return &Tensor{
 		b: t.b,

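The new IM2Col wrapper is what the 2x2 patch merge added in model_vision.go below builds on. As a shape sanity check (a standalone sketch, not part of the change; the embedding size of 1024 is only an illustrative value), ggml_im2col with is_2D=true yields [IC*KH*KW, OW, OH, N] using the usual convolution output-size rule:

package main

import "fmt"

// convOut mirrors the conv output-size rule: (in + 2*p - d*(k-1) - 1)/s + 1.
func convOut(in, k, s, p, d int) int {
	return (in+2*p-d*(k-1)-1)/s + 1
}

func main() {
	d := 1024                     // illustrative embedding size per patch
	ow := convOut(74, 2, 2, 0, 1) // 74x110 patch grid, 2x2 kernel, stride 2
	oh := convOut(110, 2, 2, 0, 1)
	fmt.Println(d*2*2, ow, oh) // 4096 37 55
	// PatchMerger.Forward below then reshapes this to [4*d, 37*55] = [4096, 2035]
	// before the merging_layer projection.
}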
+ 4 - 0
ml/backend/ggml/ggml/src/ggml-metal/ggml-metal.m

@@ -2186,6 +2186,10 @@ static void ggml_metal_encode_node(
             } break;
         case GGML_OP_MUL_MAT:
             {
+                if (ne00 != ne10) {
+                    printf("mul_mat, ne00: %d, ne01: %d, ne02: %d, ne03: %d, ne10: %d, ne11: %d, ne12: %d, ne13: %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13);
+                }
+
                 GGML_ASSERT(ne00 == ne10);

                 GGML_ASSERT(ne12 % ne02 == 0);

+ 2 - 6
model/models/mistral3/imageproc.go

@@ -21,8 +21,7 @@ func getNumImageTokens(imageSize, patchSize image.Point) image.Point {
 
 
 func getResizeOutputImageSize(img image.Image, longestEdge int, patchSize image.Point) image.Point {
 	b := img.Bounds()
-	le := float64(longestEdge)
-	ratio := math.Max(float64(b.Max.Y)/le, float64(b.Max.X)/le)
+	ratio := math.Max(float64(b.Max.Y)/float64(longestEdge), float64(b.Max.X)/float64(longestEdge))
 
 
 	newSize := img.Bounds().Max
 
 
@@ -80,17 +79,14 @@ func newImageProcessor(c ml.Config) ImageProcessor {
 		imageSize:   int(c.Uint("vision.image_size", 1540)),
 		patchSize:   int(c.Uint("vision.patch_size", 14)),
 		numChannels: int(c.Uint("vision.num_channels", 3)),
-		longestEdge: int(c.Uint("vision.longest_edge", 1024)),
+		longestEdge: int(c.Uint("vision.longest_edge", 1540)),
 	}
 }
 
 
 func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, error) {
 	outputSize := getResizeOutputImageSize(img, p.longestEdge, image.Point{p.patchSize, p.patchSize})
-
 	newImage := imageproc.Composite(img)
 	newImage = imageproc.Resize(newImage, outputSize, imageproc.ResizeBilinear)
-
 	data := imageproc.Normalize(newImage, imageproc.ClipDefaultMean, imageproc.ClipDefaultSTD, true, true)
-
 	return data, nil
 }

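For intuition on the inlined ratio above, a small worked example; only the ratio line is shown in this hunk, so the final scaling step here is an assumption about the rest of getResizeOutputImageSize:

package main

import (
	"fmt"
	"math"
)

func main() {
	// Hypothetical 3000x2000 input against the new longest_edge default of 1540.
	longestEdge := 1540.0
	w, h := 3000.0, 2000.0
	ratio := math.Max(h/longestEdge, w/longestEdge)
	fmt.Printf("ratio=%.3f\n", ratio) // ratio=1.948
	// Presumably both edges are then divided by this ratio (and later snapped to
	// patch multiples) so the longest edge fits within 1540:
	fmt.Printf("%.0f x %.0f\n", w/ratio, h/ratio) // 1540 x 1027
}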
+ 17 - 8
model/models/mistral3/model.go

@@ -2,6 +2,7 @@ package mistral3
 
 
 import (
 	"bytes"
+	"fmt"
 	"image"
 	"image"
 	"slices"
 	"slices"
 
 
@@ -59,19 +60,28 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	// Create tensor from image data
 	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
 		m.ImageProcessor.imageSize,
-		m.ImageProcessor.imageSize,
+
+		// TODO (jmorganca): this should be returned from the
+		// image processor instead of hardcoded
+		1036,
 		m.ImageProcessor.numChannels,
 	)
 	if err != nil {
 		return nil, err
 	}
 
 
+	fmt.Println("pixelValues", "shape", pixelValues.Shape(), "data", ml.Dump(ctx, pixelValues))
+
 	// Forward pass through vision model
 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 
 
+	// fmt.Println("visionOutputs", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+
 	// Project to text embedding space
 	visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.VisionModel.eps)
 
 
+	// fmt.Println("visionOutputs after projector", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+
 	return visionOutputs, nil
 }
 
 
@@ -85,16 +95,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 			inputMultimodal := inp.Multimodal.(ml.Tensor)
 
 
 			// Add special image tokens - using the imageTokenIndex from config
-			result = append(result,
-				input.Input{Token: int32(m.MultiModalProjector.imageTokenIndex)},             // Image token
-				input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // Image data
-			)
-
-			// Add image token placeholders
-			result = append(result, slices.Repeat([]input.Input{{Token: 0}}, inputMultimodal.Dim(1)-1)...)
+			result = append(result, input.Input{Token: 10})                                                       // [IMG]
+			result = append(result, input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}) // image data
+			result = append(result, slices.Repeat([]input.Input{{Token: 10}}, inputMultimodal.Dim(1)-1)...)       // [IMG] placeholders
+			result = append(result, input.Input{Token: 13})                                                       // [IMG_END]
 		}
 	}
 
 
+	fmt.Println("post tokenize", "result", result)
+
 	return result, nil
 }
 
 

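A worked illustration of the sequence PostTokenize now emits per image, using a local stand-in for input.Input and N = 4 projected image tokens for brevity: one leading [IMG] token, one token-less entry that carries the image tensor, N-1 [IMG] placeholders, then [IMG_END]:

package main

import (
	"fmt"
	"slices"
)

// Input stands in for input.Input with only the fields used here.
type Input struct {
	Token      int32
	Multimodal any
}

func main() {
	n := 4 // pretend the projected image tensor has Dim(1) == 4
	var result []Input
	result = append(result, Input{Token: 10})                            // [IMG]
	result = append(result, Input{Multimodal: "<image tensor>"})         // embedding rides on this entry
	result = append(result, slices.Repeat([]Input{{Token: 10}}, n-1)...) // [IMG] placeholders
	result = append(result, Input{Token: 13})                            // [IMG_END]
	fmt.Println(result)
	// [{10 <nil>} {0 <image tensor>} {10 <nil>} {10 <nil>} {10 <nil>} {13 <nil>}]
}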
+ 116 - 25
model/models/mistral3/model_vision.go

@@ -1,6 +1,7 @@
 package mistral3
 
 
 import (
+	"fmt"
 	"math"
 	"math"
 
 
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml"
@@ -9,31 +10,109 @@ import (
 
 
 var batchSize int = 1
 
 
+type PatchMerger struct {
+	MergingLayer *nn.Linear `gguf:"merging_layer"`
+}
+
+func (pm *PatchMerger) Forward(ctx ml.Context, visionOutputs ml.Tensor) ml.Tensor {
+	// TODO: pass these in
+	w := 110
+	h := 74
+	// tokensPerImage := w * h
+	d := visionOutputs.Dim(0)
+
+	// TODO: handle multiple images, this currently assumes one
+	fmt.Println("patchmerger visionOutputs", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+
+	// Reshape to [h, w, hidden_size]
+	imageGrid := visionOutputs.Reshape(ctx, h, w, d)
+	fmt.Println("imageGrid", "shape", imageGrid.Shape(), "data", ml.Dump(ctx, imageGrid))
+
+	// TODO: load from ml.Config
+	spatialMergeSize := 2
+	kernel := ctx.Output().Empty(ml.DTypeF32, spatialMergeSize, spatialMergeSize, d, 1)
+	fmt.Println("kernel", "shape", kernel.Shape(), "data", ml.Dump(ctx, kernel))
+
+	patches := kernel.IM2Col(ctx, imageGrid, spatialMergeSize, spatialMergeSize, 0, 0, 1, 1)
+	fmt.Println("patches", "shape", patches.Shape(), "data", ml.Dump(ctx, patches))
+
+	fmt.Println("creating reshaped", d*spatialMergeSize*spatialMergeSize, "x", patches.Dim(1)*patches.Dim(2))
+	reshaped := patches.Reshape(ctx, d*spatialMergeSize*spatialMergeSize, patches.Dim(1)*patches.Dim(2))
+	fmt.Println("reshaped", "shape", reshaped.Shape(), "data", ml.Dump(ctx, reshaped))
+
+	return pm.MergingLayer.Forward(ctx, reshaped)
+}
+
+type MultiModalProjector struct {
+	Norm        *nn.RMSNorm  `gguf:"norm"`
+	Linear1     *nn.Linear   `gguf:"linear_1"`
+	Linear2     *nn.Linear   `gguf:"linear_2"`
+	PatchMerger *PatchMerger `gguf:"patch_merger"`
+
+	spatialMergeSize int
+	imageTokenIndex  int
+	hasBias          bool
+}
+
+func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor {
+	visionOutputs = p.Norm.Forward(ctx, visionOutputs, eps)
+	fmt.Println("visionOutputs after norm", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+	visionOutputs = p.PatchMerger.Forward(ctx, visionOutputs)
+	fmt.Println("visionOutputs after patch merger", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+	visionOutputs = p.Linear1.Forward(ctx, visionOutputs).GELU(ctx)
+	fmt.Println("visionOutputs after linear1 and gelu", "shape", visionOutputs.Shape(), "data", ml.Dump(ctx, visionOutputs))
+	return p.Linear2.Forward(ctx, visionOutputs)
+}
+
+func newMultiModalProjector(c ml.Config) *MultiModalProjector {
+	return &MultiModalProjector{
+		spatialMergeSize: int(c.Uint("spatial_merge_size", 2)),
+		imageTokenIndex:  int(c.Uint("image_token_index", 10)),
+		hasBias:          c.Bool("mm.projector_bias", false),
+	}
+}
+
 type VisionSelfAttention struct {
-	Query       *nn.Linear `gguf:"attn_q"`
-	Key         *nn.Linear `gguf:"attn_k"`
-	Value       *nn.Linear `gguf:"attn_v"`
-	Output      *nn.Linear `gguf:"attn_output"`
-	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
 }
 
 
 func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 	headDim := opts.headDim
 
 
+	// fmt.Println("sa.Query", "shape", sa.Query.Weight.Shape(), "data", ml.Dump(ctx, sa.Query.Weight))
+
 	query := sa.Query.Forward(ctx, hiddenState)
 	key := sa.Key.Forward(ctx, hiddenState)
 	value := sa.Value.Forward(ctx, hiddenState)
 
 
-	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	key = key.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	value = value.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	// fmt.Println("query", "shape", query.Shape(), "data", ml.Dump(ctx, query))
+	// fmt.Println("key", "shape", key.Shape(), "data", ml.Dump(ctx, key))
+	// fmt.Println("value", "shape", value.Shape(), "data", ml.Dump(ctx, value))
 
 
-	ropeType := uint32(0)
-	query = query.RoPE(ctx, positionIDs, sa.RopeFactors, uint32(headDim), ropeType, opts.ropeBase, opts.ropeScale)
-	key = key.RoPE(ctx, positionIDs, sa.RopeFactors, uint32(headDim), ropeType, opts.ropeBase, opts.ropeScale)
+	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
+	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
+	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
+
+	// fmt.Println("query permute", "shape", query.Shape(), "data", ml.Dump(ctx, query))
+	// fmt.Println("key permute", "shape", key.Shape(), "data", ml.Dump(ctx, key))
+	// fmt.Println("value permute", "shape", value.Shape(), "data", ml.Dump(ctx, value))
+	// fmt.Println("positionIDs", "shape", positionIDs.Shape(), "data", ml.Dump(ctx, positionIDs))
+
+	// Multimodal rope
+	ropeType := uint32(24)
+	query = query.RoPEMulti(ctx, positionIDs, nil, uint32(headDim/2), [4]int{0, headDim / 2, headDim / 2, 0}, ropeType, opts.ropeBase, opts.ropeScale)
+	key = key.RoPEMulti(ctx, positionIDs, nil, uint32(headDim/2), [4]int{0, headDim / 2, headDim / 2, 0}, ropeType, opts.ropeBase, opts.ropeScale)
+
+	// fmt.Println("query rope", "shape", query.Shape(), "data", ml.Dump(ctx, query))
+	// fmt.Println("key rope", "shape", key.Shape(), "data", ml.Dump(ctx, key))
 
 
 	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), nil)
 	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), nil)
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
 
 
 	return sa.Output.Forward(ctx, attention)
 	return sa.Output.Forward(ctx, attention)
 }
 	SelfAttention *VisionSelfAttention
 	SelfAttention *VisionSelfAttention
 
 	FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
 	FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP     *VisionMLP
 }
 }
 
 func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
 
 
 	// self attention
 	// self attention
 	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, positionIDs, opts)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, positionIDs, opts)
 	hiddenState = hiddenState.Add(ctx, residual)
 	residual = hiddenState
@@ -87,25 +167,36 @@ type VisionModelOptions struct {
 
 
 type VisionModel struct {
 	PatchEmbedding *nn.Conv2D           `gguf:"patch_conv"`
-	EncoderNorm    *nn.LayerNorm        `gguf:"encoder_norm"`
+	EncoderNorm    *nn.RMSNorm          `gguf:"encoder_norm"`
 	Layers         []VisionEncoderLayer `gguf:"blk"`
 
 
 	*VisionModelOptions
 }
 
 
 func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
-	numPatchesH := m.imageSize / m.patchSize
-	numPatchesW := m.imageSize / m.patchSize
+	numPatchesH := pixelValues.Dim(1) / m.patchSize
+	numPatchesW := pixelValues.Dim(0) / m.patchSize
 	numPatches := numPatchesH * numPatchesW
-
 	hiddenState := m.PatchEmbedding.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
+	// fmt.Println("after patch embedding", "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState))
 	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize)
+	// fmt.Println("after reshape", "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState))
 	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
-
-	// Create position IDs
-	positions := make([]int32, numPatches)
-	for i := range positions {
-		positions[i] = int32(i)
+	// fmt.Println("after permute", "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState))
+
+	// TODO: this seems to have incorrect output?
+	hiddenState = m.EncoderNorm.Forward(ctx, hiddenState, m.VisionModelOptions.eps)
+	// fmt.Println("after norm", "eps", m.VisionModelOptions.eps, "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState, ml.DumpOptions{Items: 3, Precision: 6}))
+
+	// Generate 4D position IDs (time, height, width, extra) for MROPE
+	var positions []int32
+	for h := 0; h < numPatchesH; h++ {
+		for w := 0; w < numPatchesW; w++ {
+			positions = append(positions, 0)        // unused
+			positions = append(positions, int32(h)) // height
+			positions = append(positions, int32(w)) // width
+			positions = append(positions, 0)        // unused
+		}
 	}
 
 
 	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
@@ -113,14 +204,14 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
 		panic(err)
 	}
 
 
-	// Apply encoder normalization
-	hiddenState = m.EncoderNorm.Forward(ctx, hiddenState, m.eps)
+	// fmt.Println("positionIDs", "shape", positionIDs.Shape(), "data", ml.Dump(ctx, positionIDs))
 
 
-	// Process through transformer layers
 	for _, layer := range m.Layers {
 		hiddenState = layer.Forward(ctx, hiddenState, positionIDs, m.VisionModelOptions)
 	}
 
 
+	// fmt.Println("after layers", "shape", hiddenState.Shape(), "data", ml.Dump(ctx, hiddenState))
+
 	return hiddenState
 }
 
 
@@ -135,7 +226,7 @@ func newVisionModel(c ml.Config) *VisionModel {
 			imageSize:        int(c.Uint("vision.image_size", 1540)),
 			patchSize:        int(c.Uint("vision.patch_size", 14)),
 			numChannels:      int(c.Uint("vision.num_channels", 3)),
-			eps:              c.Float("vision.attention.layer_norm_epsilon", 1e-05),
+			eps:              c.Float("vision.attention.layer_norm_epsilon", 1e-5),
 			ropeBase:         c.Float("vision.rope.freq_base", 10000.0),
 			ropeScale:        c.Float("vision.rope.freq_scale", 1.0),
 		},

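The hardcoded values scattered through this commit (1540x1036 pixels in EncodeMultimodal, patch size 14, w=110/h=74 in PatchMerger.Forward, spatial merge size 2) are mutually consistent; a quick arithmetic check:

package main

import "fmt"

func main() {
	imageW, imageH := 1540, 1036 // tensor dimensions from EncodeMultimodal
	patch, merge := 14, 2        // vision.patch_size, spatial_merge_size

	pw, ph := imageW/patch, imageH/patch
	fmt.Println(pw, ph, pw*ph) // 110 74 8140 -> the w/h hardcoded in PatchMerger.Forward

	mw, mh := pw/merge, ph/merge
	fmt.Println(mw, mh, mw*mh) // 55 37 2035 -> tokens after the 2x2 merge; per the shapes
	// above, this is the Dim(1) that PostTokenize uses for the [IMG] placeholder count
}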
+ 0 - 38
model/models/mistral3/multimodal_proj.go

@@ -1,38 +0,0 @@
-package mistral3
-
-import (
-	"github.com/ollama/ollama/ml"
-	"github.com/ollama/ollama/ml/nn"
-)
-
-type MultiModalProjector struct {
-	Norm       *nn.RMSNorm `gguf:"norm"`
-	Projection *nn.Linear  `gguf:"projection"`
-
-	spatialMergeSize int
-	imageTokenIndex  int
-	hasBias          bool
-}
-
-func (p *MultiModalProjector) Forward(ctx ml.Context, visionOutputs ml.Tensor, eps float32) ml.Tensor {
-	// Apply normalization
-	visionOutputs = p.Norm.Forward(ctx, visionOutputs, eps)
-
-	// If the spatial merge size is > 1, average pool the patches
-	if p.spatialMergeSize > 1 {
-		// Implementation depends on how the model handles spatial merging
-		// For simplicity, we'll use a spatial pooling approach
-		visionOutputs = visionOutputs.AvgPool2D(ctx, p.spatialMergeSize, p.spatialMergeSize, 0)
-	}
-
-	// Project to text embedding dimension
-	return p.Projection.Forward(ctx, visionOutputs)
-}
-
-func newMultiModalProjector(c ml.Config) *MultiModalProjector {
-	return &MultiModalProjector{
-		spatialMergeSize: int(c.Uint("spatial_merge_size", 2)),
-		imageTokenIndex:  int(c.Uint("image_token_index", 10)),
-		hasBias:          c.Bool("mm.projector_bias", false),
-	}
-}