Michael Yang 3 months ago
parent
commit
6a4120143f
57 changed files with 3849 additions and 474 deletions
  1. cache/cache.go (+63 -0)
  2. convert/convert.go (+16 -16)
  3. convert/convert_bert.go (+5 -5)
  4. convert/convert_commandr.go (+5 -5)
  5. convert/convert_gemma.go (+5 -5)
  6. convert/convert_gemma2.go (+2 -4)
  7. convert/convert_gemma2_adapter.go (+5 -5)
  8. convert/convert_llama.go (+6 -6)
  9. convert/convert_llama_adapter.go (+5 -5)
  10. convert/convert_mixtral.go (+5 -5)
  11. convert/convert_phi3.go (+7 -7)
  12. convert/convert_qwen2.go (+6 -5)
  13. convert/convert_test.go (+6 -6)
  14. fs/ggml/ggml.go (+111 -95)
  15. fs/ggml/gguf.go (+6 -7)
  16. fs/ggml/type.go (+2 -7)
  17. fs/util/bufioutil/buffer_seeker.go (+0 -0)
  18. fs/util/bufioutil/buffer_seeker_test.go (+0 -0)
  19. llm/ggla.go (+0 -149)
  20. llm/ggml_test.go (+0 -1)
  21. llm/memory.go (+41 -45)
  22. llm/memory_test.go (+3 -2)
  23. llm/server.go (+13 -15)
  24. ml/backend.go (+191 -0)
  25. ml/backend/backend.go (+5 -0)
  26. ml/backend/ggml/ggml.go (+580 -0)
  27. ml/nn/convolution.go (+11 -0)
  28. ml/nn/embedding.go (+11 -0)
  29. ml/nn/linear.go (+17 -0)
  30. ml/nn/normalization.go (+22 -0)
  31. model/cmd/main.go (+160 -0)
  32. model/llama/model.go (+155 -0)
  33. model/mllama/model.go (+90 -0)
  34. model/mllama/model_text.go (+225 -0)
  35. model/mllama/model_vision.go (+234 -0)
  36. model/mllama/process_image.go (+240 -0)
  37. model/mllama/process_text.go (+25 -0)
  38. model/mllama/process_text_test.go (+87 -0)
  39. model/mllama/testdata/model.bin (+1 -0)
  40. model/mllama/testdata/theirs.json (+0 -0)
  41. model/model.go (+279 -0)
  42. model/model_test.go (+136 -0)
  43. model/process_text.go (+312 -0)
  44. model/testdata/inputs.json (+586 -0)
  45. parser/parser_test.go (+3 -3)
  46. sample/greedy.go (+13 -0)
  47. sample/sample.go (+74 -0)
  48. server/create.go (+14 -14)
  49. server/images.go (+5 -5)
  50. server/model.go (+5 -5)
  51. server/routes.go (+2 -1)
  52. server/routes_create_test.go (+4 -4)
  53. server/routes_generate_test.go (+13 -12)
  54. server/routes_test.go (+3 -3)
  55. server/sched.go (+13 -12)
  56. server/sched_test.go (+19 -18)
  57. template/template_test.go (+2 -2)

+ 63 - 0
cache/cache.go

@@ -0,0 +1,63 @@
+package cache
+
+import (
+	"github.com/ollama/ollama/ml"
+)
+
+type Options struct {
+	Position int
+}
+
+type Cache interface {
+	Sub(i int) Cache
+	Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
+}
+
+type Simple struct {
+	DType    ml.DType
+	Capacity int
+
+	keys, values []ml.Tensor
+}
+
+func (c *Simple) Sub(i int) Cache {
+	if i >= len(c.keys) {
+		c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
+		c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
+	}
+
+	return &Simple{
+		keys:     c.keys[i : i+1],
+		values:   c.values[i : i+1],
+		Capacity: c.Capacity,
+		DType:    c.DType,
+	}
+}
+
+func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
+	if c.keys[0] == nil || c.values[0] == nil {
+		c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
+		c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
+	}
+
+	ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
+	ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
+
+	n := min(c.Capacity, int(key.Dim(2))+opts.Position)
+
+	key = c.keys[0].View(ctx, 0,
+		int(key.Dim(0)), int(key.Stride(1)),
+		int(key.Dim(1)), int(key.Stride(2)),
+		n,
+	)
+
+	value = c.values[0].View(ctx, 0,
+		int(value.Dim(0)), int(value.Stride(1)),
+		int(value.Dim(1)), int(value.Stride(2)),
+		n,
+	)
+
+	// TODO shift context if necessary
+
+	return key, value
+}
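Note: the new cache package backs the model packages added later in this commit. Sub(i) gives layer i its own single-slot view into the shared store, and Put writes the current step's keys and values at opts.Position, returning views over everything cached so far (up to Capacity). A minimal sketch of how an attention layer might use it — the function and variable names below are illustrative, not part of this commit:

// Illustrative only: push a layer's K/V projections through the cache.
func cachedKV(ctx ml.Context, c cache.Cache, layer, position int, k, v ml.Tensor) (ml.Tensor, ml.Tensor) {
	// Sub(layer) grows the backing slices as needed and returns a one-entry view;
	// Put appends at `position` and returns views over all cached positions.
	return c.Sub(layer).Put(ctx, k, v, cache.Options{Position: position})
}

// The cache itself would be configured once, e.g.:
//   c := &cache.Simple{DType: ml.DTypeF32, Capacity: 2048}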

+ 16 - 16
convert/convert.go

@@ -9,7 +9,7 @@ import (
 	"log/slog"
 	"strings"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type ModelParameters struct {
@@ -27,8 +27,8 @@ type AdapterParameters struct {
 	} `json:"lora_parameters"`
 }
 
-func (ModelParameters) KV(t *Tokenizer) llm.KV {
-	kv := llm.KV{
+func (ModelParameters) KV(t *Tokenizer) ggml.KV {
+	kv := ggml.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
 		"tokenizer.ggml.pre":           t.Pre,
@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p AdapterParameters) KV() llm.KV {
+func (p AdapterParameters) KV() ggml.KV {
 	var alpha float32
 	if p.LoraParameters.Alpha == 0 {
 		alpha = float32(p.Alpha)
@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV {
 		alpha = p.LoraParameters.Alpha
 	}
 
-	kv := llm.KV{
+	kv := ggml.KV{
 		"adapter.lora.alpha": alpha,
 		"adapter.type":       "lora",
 		"general.file_type":  uint32(1),
@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
 	}
 }
 
-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
+	return ggml.WriteGGUF(ws, kv, ts)
 }
 
-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
+	return ggml.WriteGGUF(ws, kv, ts)
 }
 
 type ModelConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(*Tokenizer) llm.KV
+	KV(*Tokenizer) ggml.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
+	Tensors([]Tensor) []ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
@@ -99,7 +99,7 @@ type ModelConverter interface {
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
 	// writeFile writes the model to the provided io.WriteSeeker
-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
 }
 
 type moreParser interface {
@@ -108,17 +108,17 @@ type moreParser interface {
 
 type AdapterConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(llm.KV) llm.KV
+	KV(ggml.KV) ggml.KV
 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
+	Tensors([]Tensor) []ggml.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
 
-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
 }
 
-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
 	if err != nil {
 		return err

+ 5 - 5
convert/convert_bert.go

@@ -8,7 +8,7 @@ import (
 	"slices"
 	"strings"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type bertModel struct {
@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 	return nil
 }
 
-func (p *bertModel) KV(t *Tokenizer) llm.KV {
+func (p *bertModel) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		if slices.Contains([]string{
 			"embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
 			continue
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 5 - 5
convert/convert_commandr.go

@@ -3,7 +3,7 @@ package convert
 import (
 	"cmp"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type commandrModel struct {
@@ -24,7 +24,7 @@ type commandrModel struct {
 
 var _ ModelConverter = (*commandrModel)(nil)
 
-func (p *commandrModel) KV(t *Tokenizer) llm.KV {
+func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "command-r"
 	kv["general.name"] = "command-r"
@@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *commandrModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 5 - 5
convert/convert_gemma.go

@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type gemmaModel struct {
@@ -23,7 +23,7 @@ type gemmaModel struct {
 
 var _ ModelConverter = (*gemmaModel)(nil)
 
-func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 2 - 4
convert/convert_gemma2.go

@@ -1,8 +1,6 @@
 package convert
 
-import (
-	"github.com/ollama/ollama/llm"
-)
+import "github.com/ollama/ollama/fs/ggml"
 
 type gemma2Model struct {
 	gemmaModel
@@ -11,7 +9,7 @@ type gemma2Model struct {
 	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
 }
 
-func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
+func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma2"
 	kv["gemma2.context_length"] = p.MaxPositionEmbeddings

+ 5 - 5
convert/convert_gemma2_adapter.go

@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type gemma2Adapter struct {
@@ -15,14 +15,14 @@ type gemma2Adapter struct {
 
 var _ AdapterConverter = (*gemma2Adapter)(nil)
 
-func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
+func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "gemma2"
 	return kv
 }
 
-func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 6 - 6
convert/convert_llama.go

@@ -9,7 +9,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type llamaModel struct {
@@ -46,7 +46,7 @@ type llamaModel struct {
 
 var _ ModelConverter = (*llamaModel)(nil)
 
-func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize
@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 
 	if p.RopeScaling.factors != nil {
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     "rope_freqs.weight",
 			Kind:     0,
 			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 5 - 5
convert/convert_llama_adapter.go

@@ -7,7 +7,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type llamaAdapter struct {
@@ -18,7 +18,7 @@ type llamaAdapter struct {
 
 var _ AdapterConverter = (*llamaAdapter)(nil)
 
-func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "llama"
 	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
 	return kv
 }
 
-func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    shape,

+ 5 - 5
convert/convert_mixtral.go

@@ -6,7 +6,7 @@ import (
 	"slices"
 	"strings"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type mixtralModel struct {
@@ -15,7 +15,7 @@ type mixtralModel struct {
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }
 
-func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
 	kv := p.llamaModel.KV(t)
 
 	if p.NumLocalExperts > 0 {
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 		return true
 	})
 
-	var out []llm.Tensor
+	var out []ggml.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     n,
 			Kind:     e[0].Kind(),
 			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),

+ 7 - 7
convert/convert_phi3.go

@@ -8,7 +8,7 @@ import (
 	"strings"
 	"sync"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type phi3Model struct {
@@ -37,7 +37,7 @@ type phi3Model struct {
 
 var _ ModelConverter = (*phi3Model)(nil)
 
-func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
 	var addRopeFactors sync.Once
 
-	out := make([]llm.Tensor, 0, len(ts)+2)
+	out := make([]ggml.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, llm.Tensor{
+				out = append(out, ggml.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, llm.Tensor{
+				}, ggml.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 			})
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 6 - 5
convert/convert_qwen2.go

@@ -1,6 +1,7 @@
 package convert
 
-import "github.com/ollama/ollama/llm"
+import "github.com/ollama/ollama/fs/ggml"
+
 
 type qwen2Model struct {
 	ModelParameters
@@ -21,7 +22,7 @@ type qwen2Model struct {
 
 var _ ModelConverter = (*qwen2Model)(nil)
 
-func (q *qwen2Model) KV(t *Tokenizer) llm.KV {
+func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
 	kv := q.ModelParameters.KV(t)
 	kv["general.architecture"] = "qwen2"
 	kv["qwen2.block_count"] = q.HiddenLayers
@@ -45,10 +46,10 @@ func (q *qwen2Model) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (q *qwen2Model) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
 	for _, t := range ts {
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 6 - 6
convert/convert_test.go

@@ -20,7 +20,7 @@ import (
 
 	"golang.org/x/exp/maps"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type tensorData struct {
@@ -29,7 +29,7 @@ type tensorData struct {
 	Shape   []int  `json:"shape"`
 }
 
-func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
+func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	t.Helper()
 
 	f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })
 
-	m, _, err := llm.DecodeGGML(r, math.MaxInt)
+	m, _, err := ggml.Decode(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }
 
-func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
+func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
 	actual := make(map[string]string)
 	for k, v := range kv {
 		if s, ok := v.(json.Marshaler); !ok {
@@ -75,7 +75,7 @@ func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tenso
 		}
 	}
 
-	for _, tensor := range tensors.Items {
+	for _, tensor := range tensors.Items() {
 		sha256sum := sha256.New()
 		sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
 		if _, err := io.Copy(sha256sum, sr); err != nil {
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 			}
 			defer r.Close()
 
-			m, _, err := llm.DecodeGGML(r, math.MaxInt)
+			m, _, err := ggml.Decode(r, math.MaxInt)
 			if err != nil {
 				t.Fatal(err)
 			}

+ 111 - 95
llm/ggml.go → fs/ggml/ggml.go

@@ -1,15 +1,15 @@
-package llm
+package ggml
 
 import (
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+	"log/slog"
 	"slices"
 	"strings"
-	"sync"
 
-	"github.com/ollama/ollama/util/bufioutil"
+	"github.com/ollama/ollama/fs/util/bufioutil"
 )
 
 type GGML struct {
@@ -19,145 +19,168 @@ type GGML struct {
 
 type model interface {
 	KV() KV
-	Tensors() *Tensors
+	Tensors() Tensors
 }
 
 type KV map[string]any
 
-func (kv KV) u64(key string) uint64 {
-	switch v := kv[key].(type) {
-	case uint64:
-		return v
-	case uint32:
-		return uint64(v)
-	case float64:
-		return uint64(v)
-	default:
-		return 0
-	}
-}
-
 func (kv KV) Architecture() string {
-	if s, ok := kv["general.architecture"].(string); ok {
-		return s
-	}
-
-	return "unknown"
+	return kv.String("general.architecture", "unknown")
 }
 
 func (kv KV) Kind() string {
-	if s, ok := kv["general.type"].(string); ok {
-		return s
-	}
-
-	return "unknown"
+	return kv.String("general.type", "unknown")
 }
 
 func (kv KV) ParameterCount() uint64 {
-	return kv.u64("general.parameter_count")
+	return keyValue[uint64](kv, "general.parameter_count")
 }
 
 func (kv KV) FileType() fileType {
-	if u64 := kv.u64("general.file_type"); u64 > 0 {
-		return fileType(uint32(u64))
+	if t := kv.Uint("general.file_type"); t > 0 {
+		return fileType(t)
 	}
 
 	return fileTypeUnknown
 }
 
 func (kv KV) BlockCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
+	return uint64(kv.Uint("block_count"))
+}
+
+func (kv KV) EmbeddingLength() uint64 {
+	return uint64(kv.Uint("embedding_length"))
 }
 
 func (kv KV) HeadCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
+	return uint64(kv.Uint("attention.head_count"))
 }
 
 func (kv KV) HeadCountKV() uint64 {
-	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
-		return headCountKV
-	}
-
-	return 1
+	return uint64(kv.Uint("attention.head_count_kv", 1))
 }
 
 func (kv KV) EmbeddingHeadCount() uint64 {
 	if heads := kv.HeadCount(); heads > 0 {
-		return kv.EmbeddingLength() / kv.HeadCount()
+		return kv.EmbeddingLength() / heads
 	}
 
 	return 0
 }
 
 func (kv KV) EmbeddingHeadCountK() uint64 {
-	if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
-		return k
-	}
-
-	return kv.EmbeddingHeadCount()
+	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
 }
 
 func (kv KV) EmbeddingHeadCountV() uint64 {
-	if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
-		return v
-	}
-
-	return kv.EmbeddingHeadCount()
+	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
 }
 
 func (kv KV) GQA() uint64 {
 	return kv.HeadCount() / kv.HeadCountKV()
 }
 
-func (kv KV) EmbeddingLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
-}
-
 func (kv KV) ContextLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
+	return uint64(kv.Uint("context_length"))
 }
 
 func (kv KV) ChatTemplate() string {
-	s, _ := kv["tokenizer.chat_template"].(string)
+	return kv.String("tokenizer.chat_template")
+}
+
+func (kv KV) String(key string, defaultValue ...string) string {
+	return keyValue(kv, key, append(defaultValue, "")...)
+}
+
+func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
+	return keyValue(kv, key, append(defaultValue, 0)...)
+}
+
+func (kv KV) Float(key string, defaultValue ...float32) float32 {
+	return keyValue(kv, key, append(defaultValue, 0)...)
+}
+
+func (kv KV) Strings(key string, defaultValue ...[]string) []string {
+	r := keyValue(kv, key, &array{})
+	s := make([]string, r.size)
+	for i := range r.size {
+		s[i] = r.values[i].(string)
+	}
+
+	return s
+}
+
+func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
+	r := keyValue(kv, key, &array{})
+	s := make([]uint32, r.size)
+	for i := range r.size {
+		s[i] = uint32(r.values[i].(int32))
+	}
+
 	return s
 }
 
+func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
+	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
+		key = kv.Architecture() + "." + key
+	}
+
+	if val, ok := kv[key]; ok {
+		return val.(T)
+	}
+
+	slog.Warn("key not found", "key", key, "default", defaultValue[0])
+	return defaultValue[0]
+}
+
 type Tensors struct {
-	Items  []*Tensor
+	items  []*Tensor
 	Offset uint64
+}
 
-	layers     map[string]Layer
-	layersOnce sync.Once
-}
-
-func (ts *Tensors) Layers() map[string]Layer {
-	ts.layersOnce.Do(func() {
-		ts.layers = make(map[string]Layer)
-		for _, t := range ts.Items {
-			parts := strings.Split(t.Name, ".")
-			if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
-				if len(parts) > index+2 {
-					// blk and mm should have a number after them, join it
-					parts = append(
-						[]string{strings.Join(parts[:index+2], ".")},
-						parts[index+2:]...)
-				}
-			}
+func (s Tensors) Items(prefix ...string) []*Tensor {
+	if len(prefix) == 0 {
+		return s.items
+	}
 
-			if _, ok := ts.layers[parts[0]]; !ok {
-				ts.layers[parts[0]] = make(Layer)
-			}
+	var items []*Tensor
+	for _, t := range s.items {
+		if strings.HasPrefix(t.Name, prefix[0]) {
+			items = append(items, t)
+		}
+	}
 
-			ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	return items
+}
+
+func (ts Tensors) Layers() map[string]Layer {
+	layers := make(map[string]Layer)
+	for _, t := range ts.items {
+		parts := strings.Split(t.Name, ".")
+		if i := slices.Index(parts, "blk"); i > 0 {
+			parts = append([]string{
+				strings.Join(parts[:i], "."),
+				strings.Join(parts[i:i+2], "."),
+			}, parts[i+2:]...)
+		} else if i == 0 {
+			parts = append([]string{
+				strings.Join(parts[i:i+2], "."),
+			}, parts[i+2:]...)
 		}
-	})
 
-	return ts.layers
+		if _, ok := layers[parts[0]]; !ok {
+			layers[parts[0]] = make(Layer)
+		}
+
+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
+	}
+
+	return layers
 }
 
 type Layer map[string]*Tensor
 
-func (l Layer) size() (size uint64) {
+func (l Layer) Size() (size uint64) {
 	for _, t := range l {
 		size += t.Size()
 	}
@@ -255,8 +278,6 @@ func (t Tensor) typeSize() uint64 {
 		return 8
 	case 29: // IQ1_M
 		return blockSize/8 + blockSize/16 + blockSize/32
-	case 30: // BF16
-		return 2
 	default:
 		return 0
 	}
@@ -295,7 +316,7 @@ const (
 
 var ErrUnsupportedFormat = errors.New("unsupported model format")
 
-func DetectGGMLType(b []byte) string {
+func DetectContentType(b []byte) string {
 	switch binary.LittleEndian.Uint32(b[:4]) {
 	case FILE_MAGIC_GGML:
 		return "ggml"
@@ -312,12 +333,12 @@ func DetectGGMLType(b []byte) string {
 	}
 }
 
-// DecodeGGML decodes a GGML model from the given reader.
+// Decode decodes a GGML model from the given reader.
 //
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
 // the maxArraySize is negative, all arrays are collected.
-func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	if maxArraySize == 0 {
 		maxArraySize = 1024
 	}
@@ -331,10 +352,6 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 
 	var c container
 	switch magic {
-	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
-		return nil, 0, ErrUnsupportedFormat
-	case FILE_MAGIC_GGLA:
-		c = &containerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
 		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
 	case FILE_MAGIC_GGUF_BE:
@@ -530,21 +547,20 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 }
 
 // SupportsKVCacheType checks if the requested cache type is supported
-func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
-	validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
-	return slices.Contains(validKVCacheTypes, cacheType)
+func (llm GGML) SupportsKVCacheType(cacheType string) bool {
+	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }
 
 // SupportsFlashAttention checks if the model supports flash attention
-func (ggml GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
+func (llm GGML) SupportsFlashAttention() bool {
+	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
 	if isEmbedding {
 		return false
 	}
 
 	// Check head counts match and are non-zero
-	headCountK := ggml.KV().EmbeddingHeadCountK()
-	headCountV := ggml.KV().EmbeddingHeadCountV()
+	headCountK := llm.KV().EmbeddingHeadCountK()
+	headCountV := llm.KV().EmbeddingHeadCountV()
 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }
 

+ 6 - 7
llm/gguf.go → fs/ggml/gguf.go

@@ -1,4 +1,4 @@
-package llm
+package ggml
 
 import (
 	"bytes"
@@ -8,10 +8,9 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
+	"maps"
 	"slices"
 	"strings"
-
-	"golang.org/x/exp/maps"
 )
 
 type containerGGUF struct {
@@ -110,9 +109,9 @@ func (llm *gguf) KV() KV {
 	return llm.kv
 }
 
-func (llm *gguf) Tensors() *Tensors {
-	return &Tensors{
-		Items:  llm.tensors,
+func (llm *gguf) Tensors() Tensors {
+	return Tensors{
+		items:  llm.tensors,
 		Offset: llm.tensorOffset,
 	}
 }
@@ -523,7 +522,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 		return err
 	}
 
-	keys := maps.Keys(kv)
+	keys := slices.Collect(maps.Keys(kv))
 	slices.Sort(keys)
 
 	for _, key := range keys {
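Note: the only behavioral nuance here is key iteration. golang.org/x/exp/maps.Keys returned a slice, while the standard-library maps.Keys (Go 1.23+) returns an iterator, so the keys are now materialized with slices.Collect before sorting:

keys := slices.Collect(maps.Keys(kv)) // iter.Seq[string] -> []string
slices.Sort(keys)                     // deterministic key order in the written GGUF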

+ 2 - 7
llm/filetype.go → fs/ggml/type.go

@@ -1,4 +1,4 @@
-package llm
+package ggml
 
 import "fmt"
 
@@ -32,10 +32,9 @@ const (
 	fileTypeIQ1_S
 	fileTypeIQ4_NL
 	fileTypeIQ3_S
-	fileTypeIQ3_M
 	fileTypeIQ2_S
-	fileTypeIQ2_M
 	fileTypeIQ4_XS
+	fileTypeIQ2_M
 	fileTypeIQ1_M
 	fileTypeBF16
 
@@ -94,8 +93,6 @@ func ParseFileType(s string) (fileType, error) {
 		return fileTypeIQ4_NL, nil
 	case "IQ3_S":
 		return fileTypeIQ3_S, nil
-	case "IQ3_M":
-		return fileTypeIQ3_M, nil
 	case "IQ2_S":
 		return fileTypeIQ2_S, nil
 	case "IQ4_XS":
@@ -163,8 +160,6 @@ func (t fileType) String() string {
 		return "IQ4_NL"
 	case fileTypeIQ3_S:
 		return "IQ3_S"
-	case fileTypeIQ3_M:
-		return "IQ3_M"
 	case fileTypeIQ2_S:
 		return "IQ2_S"
 	case fileTypeIQ4_XS:

+ 0 - 0
util/bufioutil/buffer_seeker.go → fs/util/bufioutil/buffer_seeker.go


+ 0 - 0
util/bufioutil/buffer_seeker_test.go → fs/util/bufioutil/buffer_seeker_test.go


+ 0 - 149
llm/ggla.go

@@ -1,149 +0,0 @@
-package llm
-
-import (
-	"encoding/binary"
-	"errors"
-	"io"
-	"slices"
-)
-
-type containerGGLA struct {
-	version uint32
-}
-
-func (c *containerGGLA) Name() string {
-	return "ggla"
-}
-
-func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
-	if err := binary.Read(rs, binary.LittleEndian, &c.version); err != nil {
-		return nil, err
-	}
-
-	switch c.version {
-	case 1:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	model := newGGLA(c)
-	err := model.decode(rs)
-	return model, err
-}
-
-type ggla struct {
-	*containerGGLA
-
-	kv      KV
-	tensors []*Tensor
-
-	tensorOffset uint64
-}
-
-func newGGLA(container *containerGGLA) *ggla {
-	return &ggla{
-		containerGGLA: container,
-		kv:            make(KV),
-	}
-}
-
-func (llm *ggla) KV() KV {
-	return llm.kv
-}
-
-func (llm *ggla) Tensors() *Tensors {
-	return &Tensors{
-		Items:  llm.tensors,
-		Offset: llm.tensorOffset,
-	}
-}
-
-func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
-	var r uint32
-	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
-		return err
-	}
-	llm.kv["r"] = r
-
-	var alpha uint32
-	if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
-		return err
-	}
-	llm.kv["alpha"] = alpha
-
-	offset, err := rs.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-
-	llm.tensorOffset = uint64(offset)
-
-	for {
-		var dims uint32
-		if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
-			if errors.Is(err, io.EOF) {
-				return nil
-			}
-			return err
-		}
-
-		defer func() {
-			if errors.Is(retErr, io.EOF) {
-				retErr = io.ErrUnexpectedEOF
-			}
-		}()
-
-		var namesize uint32
-		if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
-			return err
-		}
-
-		var t Tensor
-		if err := binary.Read(rs, binary.LittleEndian, &t.Kind); err != nil {
-			return err
-		}
-
-		t.Shape = make([]uint64, dims)
-		for i := 0; uint32(i) < dims; i++ {
-			var shape32 uint32
-			if err := binary.Read(rs, binary.LittleEndian, &shape32); err != nil {
-				return err
-			}
-
-			t.Shape[i] = uint64(shape32)
-		}
-
-		// ggla tensor shape is reversed
-		// ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44
-		slices.Reverse(t.Shape)
-
-		name := make([]byte, namesize)
-		if err := binary.Read(rs, binary.LittleEndian, &name); err != nil {
-			return err
-		}
-
-		t.Name = string(name)
-
-		offset, err := rs.Seek(0, io.SeekCurrent)
-		if err != nil {
-			return err
-		}
-
-		if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
-			return err
-		}
-
-		offset, err = rs.Seek(0, io.SeekCurrent)
-		if err != nil {
-			return err
-		}
-
-		t.Offset = uint64(offset)
-
-		if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
-			return err
-		}
-
-		llm.tensors = append(llm.tensors, &t)
-	}
-}

+ 0 - 1
llm/ggml_test.go

@@ -1 +0,0 @@
-package llm

+ 41 - 45
llm/memory.go

@@ -11,18 +11,19 @@ import (
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate := EstimateGPULayers(gpus, f, projectors, opts)
 		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
 		if opts.NumGPU < 0 {
-			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
+			if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
 				return true, estimatedVRAM
 			}
 		} else {
@@ -70,7 +71,7 @@ type MemoryEstimate struct {
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options) MemoryEstimate {
 	// Graph size for a partial offload, applies to all GPUs
 	var graphPartialOffload uint64
 
@@ -115,33 +116,31 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
-	layers := ggml.Tensors().Layers()
+	layers := f.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		layerSize = blk0.size()
+		layerSize = blk0.Size()
 	} else {
 		slog.Warn("model missing blk.0 layer size")
 	}
 
-	fa := envconfig.FlashAttention() &&
-		discover.GetGPUInfo().FlashAttentionSupported() &&
-		ggml.SupportsFlashAttention()
-
 	var kvct string
-	if fa {
+	if envconfig.FlashAttention() &&
+		discover.GetGPUInfo().FlashAttentionSupported() &&
+		f.SupportsFlashAttention() {
 		requested := strings.ToLower(envconfig.KvCacheType())
-		if requested != "" && ggml.SupportsKVCacheType(requested) {
+		if requested != "" && f.SupportsKVCacheType(requested) {
 			kvct = requested
 		}
 	}
 
-	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
+	kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
 
 	// KV is proportional to the number of layers
-	layerSize += kv / ggml.KV().BlockCount()
+	layerSize += kv / f.KV().BlockCount()
 
 	if graphPartialOffload == 0 {
-		graphPartialOffload = ggml.KV().GQA() * kv / 6
+		graphPartialOffload = f.KV().GQA() * kv / 6
 	}
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
@@ -156,12 +155,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 	}
 
 	if layer, ok := layers["output_norm"]; ok {
-		memoryLayerOutput += layer.size()
+		memoryLayerOutput += layer.Size()
 	}
 	if layer, ok := layers["output"]; ok {
-		memoryLayerOutput += layer.size()
+		memoryLayerOutput += layer.Size()
 	} else if layer, ok := layers["token_embd"]; ok {
-		memoryLayerOutput += layer.size()
+		memoryLayerOutput += layer.Size()
 	}
 
 	// Output layer handled at the end if we have space
@@ -211,11 +210,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 	}
 
 	// For all the layers, find where they can fit on the GPU(s)
-	for i := range int(ggml.KV().BlockCount()) {
+	for i := range int(f.KV().BlockCount()) {
 		// Some models have inconsistent layer sizes
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
-			layerSize = blk.size()
-			layerSize += kv / ggml.KV().BlockCount()
+			layerSize = blk.Size()
+			layerSize += kv / f.KV().BlockCount()
 		}
 		memoryWeights += layerSize
 
@@ -238,10 +237,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 			}
 		}
 	}
-	if layerCount >= int(ggml.KV().BlockCount()) {
+	if layerCount >= int(f.KV().BlockCount()) {
 		fullyLoaded = true
 	} else {
-		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+		for i := layerCount; i < int(f.KV().BlockCount()); i++ {
 			overflow += layerSize
 		}
 	}
@@ -259,7 +258,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 			}
 		}
 
-		if layerCount < int(ggml.KV().BlockCount())+1 {
+		if layerCount < int(f.KV().BlockCount())+1 {
 			fullyLoaded = false
 			overflow += memoryLayerOutput
 		}
@@ -311,7 +310,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 
 		inferenceLibrary:    gpus[0].Library,
 		layersRequested:     opts.NumGPU,
-		layersModel:         int(ggml.KV().BlockCount()) + 1,
+		layersModel:         int(f.KV().BlockCount()) + 1,
 		availableList:       availableList,
 		kv:                  kv,
 		allocationsList:     allocationsList,
@@ -339,22 +338,9 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 	return estimate
 }
 
-func (m MemoryEstimate) log() {
-	overhead := envconfig.GpuOverhead()
-
-	log := slog.With()
-	if m.projectorWeights > 0 {
-		log = log.With(
-			slog.Group(
-				"projector",
-				"weights", format.HumanBytes2(m.projectorWeights),
-				"graph", format.HumanBytes2(m.projectorGraph),
-			),
-		)
-	}
-
-	log.Info(
-		"offload to "+m.inferenceLibrary,
+func (m MemoryEstimate) LogValue() slog.Value {
+	attrs := []slog.Attr{
+		slog.String("library", m.inferenceLibrary),
 		slog.Group(
 			"layers",
 			// requested number of layers to offload
@@ -370,7 +356,7 @@ func (m MemoryEstimate) log() {
 			"memory",
 			// memory available by GPU for offloading
 			"available", m.availableList,
-			"gpu_overhead", format.HumanBytes2(overhead),
+			"gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()),
 			slog.Group(
 				"required",
 				// memory required for full offloading
@@ -399,7 +385,17 @@ func (m MemoryEstimate) log() {
 				"partial", format.HumanBytes2(m.graphPartialOffload),
 			),
 		),
-	)
+	}
+
+	if m.projectorWeights > 0 {
+		attrs = append(attrs, slog.Group(
+			"projector",
+			"weights", format.HumanBytes2(m.projectorWeights),
+			"graph", format.HumanBytes2(m.projectorGraph),
+		))
+	}
+
+	return slog.GroupValue(attrs...)
 }
 
 func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
@@ -409,13 +405,13 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 	}
 	defer file.Close()
 
-	ggml, _, err := DecodeGGML(file, 0)
+	ggml, _, err := ggml.Decode(file, 0)
 	if err != nil {
 		return 0, 0
 	}
 
 	for _, layer := range ggml.Tensors().Layers() {
-		weights += layer.size()
+		weights += layer.Size()
 	}
 
 	switch arch := ggml.KV().Architecture(); arch {

+ 3 - 2
llm/memory_test.go

@@ -11,6 +11,7 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 func TestEstimateGPULayers(t *testing.T) {
@@ -23,7 +24,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	defer f.Close()
 	inputLayerCount := 5
 
-	tensors := []Tensor{
+	tensors := []ggml.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
@@ -32,7 +33,7 @@ func TestEstimateGPULayers(t *testing.T) {
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}
 	assert.Len(t, tensors, inputLayerCount+1)
-	err = WriteGGUF(f, KV{
+	err = ggml.WriteGGUF(f, ggml.KV{
 		"general.architecture":          "llama",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),

+ 13 - 15
llm/server.go

@@ -29,6 +29,7 @@ import (
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llama"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 type LlamaServer interface {
@@ -71,7 +72,7 @@ type llmServer struct {
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
 // the maxArraySize is negative, all arrays are collected.
-func LoadModel(model string, maxArraySize int) (*GGML, error) {
+func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
@@ -82,22 +83,19 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 	}
 	defer f.Close()
 
-	ggml, _, err := DecodeGGML(f, maxArraySize)
+	ggml, _, err := ggml.Decode(f, maxArraySize)
 	return ggml, err
 }
 
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
-	var systemTotalMemory uint64
-	var systemFreeMemory uint64
-	var systemSwapFreeMemory uint64
 
 	systemInfo := discover.GetSystemInfo()
-	systemTotalMemory = systemInfo.System.TotalMemory
-	systemFreeMemory = systemInfo.System.FreeMemory
-	systemSwapFreeMemory = systemInfo.System.FreeSwap
+	systemTotalMemory := systemInfo.System.TotalMemory
+	systemFreeMemory := systemInfo.System.FreeMemory
+	systemSwapFreeMemory := systemInfo.System.FreeSwap
 	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
 
 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
@@ -105,7 +103,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		gpus = discover.GetCPUInfo()
 	}
 
-	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+	estimate := EstimateGPULayers(gpus, f, projectors, opts)
 	if len(gpus) > 1 || gpus[0].Library != "cpu" {
 		switch {
 		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
@@ -131,7 +129,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		}
 	}
 
-	estimate.log()
+	slog.Info("offload", "", estimate)
 
 	params := []string{
 		"--model", model,
@@ -175,7 +173,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		fa = false
 	}
 
-	if fa && !ggml.SupportsFlashAttention() {
+	if fa && !f.SupportsFlashAttention() {
 		slog.Warn("flash attention enabled but not supported by model")
 		fa = false
 	}
@@ -188,7 +186,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 
 		// Flash Attention also supports kv cache quantization
 		// Enable if the requested and kv cache type is supported by the model
-		if kvct != "" && ggml.SupportsKVCacheType(kvct) {
+		if kvct != "" && f.SupportsKVCacheType(kvct) {
 			params = append(params, "--kv-cache-type", kvct)
 		} else {
 			slog.Warn("kv cache type not supported by model", "type", kvct)
@@ -201,7 +199,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	for _, g := range gpus {
 		if g.Library == "metal" &&
 			uint64(opts.NumGPU) > 0 &&
-			uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
+			uint64(opts.NumGPU) < f.KV().BlockCount()+1 {
 			opts.UseMMap = new(bool)
 			*opts.UseMMap = false
 		}
@@ -341,7 +339,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			estimate:    estimate,
 			numParallel: numParallel,
 			sem:         semaphore.NewWeighted(int64(numParallel)),
-			totalLayers: ggml.KV().BlockCount() + 1,
+			totalLayers: f.KV().BlockCount() + 1,
 			gpus:        gpus,
 			done:        make(chan error, 1),
 		}

+ 191 - 0
ml/backend.go

@@ -0,0 +1,191 @@
+package ml
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"strings"
+)
+
+type Config interface {
+	Architecture() string
+	String(string, ...string) string
+	Uint(string, ...uint32) uint32
+	Float(string, ...float32) float32
+
+	Strings(string, ...[]string) []string
+	Uints(string, ...[]uint32) []uint32
+}
+
+type Backend interface {
+	Config() Config
+	Get(name string) Tensor
+	NewContext() Context
+}
+
+var backends = make(map[string]func(*os.File) (Backend, error))
+
+func RegisterBackend(name string, f func(*os.File) (Backend, error)) {
+	if _, ok := backends[name]; ok {
+		panic("backend: backend already registered")
+	}
+
+	backends[name] = f
+}
+
+func NewBackend(f *os.File) (Backend, error) {
+	if backend, ok := backends["ggml"]; ok {
+		return backend(f)
+	}
+
+	return nil, fmt.Errorf("unsupported backend")
+}
+
+type Context interface {
+	Zeros(dtype DType, shape ...int) Tensor
+	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
+	FromIntSlice(s []int32, shape ...int) (Tensor, error)
+
+	Forward(Tensor)
+	Compute(Tensor) Tensor
+	Close() error
+}
+
+type Tensor interface {
+	Dim(n int) int64
+	Stride(n int) int64
+
+	Shape() []int64
+	DType() DType
+
+	Bytes() []byte
+	Floats() []float32
+
+	Add(ctx Context, t2 Tensor) Tensor
+	Mul(ctx Context, t2 Tensor) Tensor
+	Mulmat(ctx Context, t2 Tensor) Tensor
+
+	Softmax(ctx Context) Tensor
+	LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
+	RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
+	Scale(ctx Context, s float64) Tensor
+
+	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor
+	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim uint32, base, scale float32) Tensor
+
+	Tanh(ctx Context) Tensor
+	GELU(ctx Context) Tensor
+	SILU(ctx Context) Tensor
+
+	Reshape(ctx Context, shape ...int64) Tensor
+	View(ctx Context, offset int, shape ...int) Tensor
+	Permute(ctx Context, shape ...int) Tensor
+	Contiguous(ctx Context) Tensor
+
+	Pad(ctx Context, shape ...int64) Tensor
+	Unpad(ctx Context, shape ...int64) Tensor
+
+	Stack(ctx Context, dim int, s ...Tensor) Tensor
+	Concat(ctx Context, t2 Tensor, dim int) Tensor
+	Rows(ctx Context, t2 Tensor) Tensor
+	Copy(ctx Context, t2 Tensor) Tensor
+}
+
+type number interface {
+	~int | ~int8 | ~int16 | ~int32 | ~int64 |
+		~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 |
+		~float32 | ~float64 |
+		~complex64 | ~complex128
+}
+
+func mul[T number](s ...T) T {
+	p := T(1)
+	for _, v := range s {
+		p *= v
+	}
+
+	return p
+}
+
+type DumpOptions struct {
+	// Items is the number of elements to print at the beginning and end of each dimension.
+	Items int64
+
+	// Precision is the number of decimal places to print. Applies to float32 and float64.
+	Precision int
+}
+
+func Dump(t Tensor, opts ...DumpOptions) string {
+	if len(opts) < 1 {
+		opts = append(opts, DumpOptions{
+			Items:     3,
+			Precision: 4,
+		})
+	}
+
+	switch t.DType() {
+	case DTypeF32:
+		return dump[[]float32](t, opts[0])
+	case DTypeI32:
+		return dump[[]int32](t, opts[0])
+	default:
+		return "<unsupported>"
+	}
+}
+
+func dump[S ~[]E, E number](t Tensor, opts DumpOptions) string {
+	bts := t.Bytes()
+	if bts == nil {
+		return "<nil>"
+	}
+
+	s := make(S, mul(t.Shape()...))
+	if err := binary.Read(bytes.NewBuffer(t.Bytes()), binary.LittleEndian, &s); err != nil {
+		panic(err)
+	}
+
+	shape := t.Shape()
+
+	var sb strings.Builder
+	var f func([]int64, int64)
+	f = func(dims []int64, stride int64) {
+		prefix := strings.Repeat(" ", len(shape)-len(dims)+1)
+		fmt.Fprint(&sb, "[")
+		defer func() { fmt.Fprint(&sb, "]") }()
+		for i := int64(0); i < dims[0]; i++ {
+			if i >= opts.Items && i < dims[0]-opts.Items {
+				fmt.Fprint(&sb, "..., ")
+				// skip to next printable element
+				skip := dims[0] - 2*opts.Items
+				if len(dims) > 1 {
+					stride += mul(append(dims[1:], skip)...)
+					fmt.Fprint(&sb, strings.Repeat("\n", len(dims)-1), prefix)
+				}
+				i += skip - 1
+			} else if len(dims) > 1 {
+				f(dims[1:], stride)
+				stride += mul(dims[1:]...)
+				if i < dims[0]-1 {
+					fmt.Fprint(&sb, ",", strings.Repeat("\n", len(dims)-1), prefix)
+				}
+			} else {
+				fmt.Fprint(&sb, s[stride+i])
+				if i < dims[0]-1 {
+					fmt.Fprint(&sb, ", ")
+				}
+			}
+		}
+	}
+	f(shape, 0)
+
+	return sb.String()
+}
+
+type DType int
+
+const (
+	DTypeF32 DType = iota
+	DTypeI32
+	DTypeOther
+)
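Note: ml defines the backend-agnostic surface — Backend loads weights and hands out Contexts, Context builds and runs a graph, Tensor is the node type, and Dump pretty-prints F32/I32 tensors for debugging. Backends register themselves by name, and NewBackend currently resolves only "ggml". A minimal end-to-end sketch under those assumptions (the model path is illustrative, and the ggml backend is linked in via the blank import added in ml/backend/backend.go below):

package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/ml"
	_ "github.com/ollama/ollama/ml/backend" // registers the "ggml" backend via init()
)

func main() {
	f, err := os.Open("model.gguf") // illustrative path
	if err != nil {
		panic(err)
	}

	b, err := ml.NewBackend(f) // dispatches to the registered "ggml" backend
	if err != nil {
		panic(err)
	}

	ctx := b.NewContext()
	defer ctx.Close()

	a, _ := ctx.FromFloatSlice([]float32{1, 2, 3, 4}, 2, 2)
	x, _ := ctx.FromFloatSlice([]float32{1, 0, 0, 1}, 2, 2)

	y := ctx.Compute(a.Mulmat(ctx, x)) // builds the graph, runs it, reads the data back
	fmt.Println(ml.Dump(y))            // default DumpOptions: 3 items per dim, 4 decimals
}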

+ 5 - 0
ml/backend/backend.go

@@ -0,0 +1,5 @@
+package backend
+
+import (
+	_ "github.com/ollama/ollama/ml/backend/ggml"
+)

+ 580 - 0
ml/backend/ggml/ggml.go

@@ -0,0 +1,580 @@
+package ggml
+
+// #cgo CPPFLAGS: -I${SRCDIR}/ggml/include
+// #include <stdlib.h>
+// #include <stdint.h>
+// #include "ggml.h"
+// #include "ggml-cpu.h"
+// #include "ggml-backend.h"
+import "C"
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"sync"
+	"unsafe"
+
+	"github.com/ollama/ollama/format"
+	fs "github.com/ollama/ollama/fs/ggml"
+	"github.com/ollama/ollama/ml"
+	"golang.org/x/sync/errgroup"
+
+	"github.com/ollama/ollama/ml/backend/ggml/ggml/src"
+)
+
+type device struct {
+	d *C.struct_ggml_backend_device
+}
+
+func (d device) LogValue() slog.Value {
+	var free, total uint64
+	C.ggml_backend_dev_memory(d.d, (*C.size_t)(&free), (*C.size_t)(&total))
+
+	kind := "unknown"
+	switch C.ggml_backend_dev_type(d.d) {
+	case C.GGML_BACKEND_DEVICE_TYPE_CPU:
+		kind = "cpu"
+	case C.GGML_BACKEND_DEVICE_TYPE_GPU:
+		kind = "gpu"
+	case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
+		kind = "accel"
+	}
+
+	return slog.GroupValue(
+		slog.String("name", C.GoString(C.ggml_backend_dev_name(d.d))),
+		slog.String("description", C.GoString(C.ggml_backend_dev_description(d.d))),
+		slog.String("kind", kind),
+		slog.String("free", format.HumanBytes2(free)),
+		slog.String("total", format.HumanBytes2(total)),
+	)
+}
+
+var devices = sync.OnceValue(func() []device {
+	ggml.OnceLoad()
+
+	s := make([]device, C.ggml_backend_dev_count())
+	for i := range s {
+		s[i] = device{C.ggml_backend_dev_get(C.size_t(i))}
+	}
+
+	return s
+})
+
+type Backend struct {
+	meta       *fs.GGML
+	cpus, gpus []Context
+	tensors    map[string]*Context
+}
+
+func New(r *os.File) (ml.Backend, error) {
+	meta, n, err := fs.Decode(r, -1)
+	if err != nil {
+		return nil, err
+	}
+
+	slog.Info(
+		"",
+		"architecture", meta.KV().Architecture(),
+		"file_type", meta.KV().FileType(),
+		"name", meta.KV().String("general.name"),
+		"description", meta.KV().String("general.description"),
+		"num_tensors", len(meta.Tensors().Items()),
+		"num_key_values", len(meta.KV()),
+	)
+
+	var cpus, gpus []Context
+	for _, d := range devices() {
+		switch C.ggml_backend_dev_type(d.d) {
+		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
+			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
+			slog.Info("cpu", "device", d)
+			cpus = append(cpus, Context{
+				ctx: C.ggml_init(C.struct_ggml_init_params{
+					mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
+					no_alloc: true,
+				}),
+				backend: C.ggml_backend_dev_init(d.d, nil),
+			})
+		case C.GGML_BACKEND_DEVICE_TYPE_GPU:
+			slog.Info("gpu", "device", d)
+			gpus = append(gpus, Context{
+				ctx: C.ggml_init(C.struct_ggml_init_params{
+					mem_size: C.size_t(int(C.ggml_tensor_overhead()) * (len(meta.Tensors().Items()) + 1 + int(meta.KV().BlockCount())*2)),
+					no_alloc: true,
+				}),
+				backend: C.ggml_backend_dev_init(d.d, nil),
+			})
+		}
+	}
+
+	ctxFunc := func(s []Context) (*Context, error) {
+		for _, e := range s {
+			return &e, nil
+		}
+
+		return nil, fmt.Errorf("no devices available")
+	}
+
+	tensors := make(map[*fs.Tensor]*Context, len(meta.Tensors().Items()))
+	for _, t := range meta.Tensors().Items() {
+		c, err := ctxFunc(append(gpus, cpus...))
+		if err != nil {
+			return nil, err
+		}
+
+		func() {
+			tt := C.ggml_new_tensor(c.ctx, t.Kind, C.int(len(t.Shape)), (*C.int64_t)(unsafe.Pointer(&t.Shape[0])))
+
+			cname := C.CString(t.Name)
+			defer C.free(unsafe.Pointer(cname))
+			C.ggml_set_name(tt, cname)
+
+			tensors[t] = c
+		}()
+	}
+
+	for _, b := range append(gpus, cpus...) {
+		C.ggml_backend_alloc_ctx_tensors(b.ctx, b.backend)
+	}
+
+	sr := io.NewSectionReader(r, int64(meta.Tensors().Offset), n-int64(meta.Tensors().Offset))
+
+	var g errgroup.Group
+	for t, c := range tensors {
+		g.Go(func() error {
+			bts := make([]byte, t.Size())
+			n, err := io.ReadFull(io.NewSectionReader(sr, int64(t.Offset), int64(t.Size())), bts)
+			if err != nil {
+				return err
+			}
+
+			if n != int(t.Size()) {
+				return fmt.Errorf("expected %d bytes, got %d", t.Size(), n)
+			}
+
+			cname := C.CString(t.Name)
+			defer C.free(unsafe.Pointer(cname))
+
+			C.ggml_backend_tensor_set(C.ggml_get_tensor(c.ctx, cname), unsafe.Pointer(&bts[0]), 0, C.size_t(n))
+			return nil
+		})
+	}
+
+	if err := g.Wait(); err != nil {
+		return nil, err
+	}
+
+	return &Backend{
+		meta: meta,
+		cpus: cpus,
+		gpus: gpus,
+	}, nil
+}
+
+func init() {
+	ml.RegisterBackend("ggml", New)
+}
+
+func (b *Backend) Config() ml.Config {
+	return b.meta.KV()
+}
+
+func (b *Backend) Get(name string) ml.Tensor {
+	cname := C.CString(name)
+	defer C.free(unsafe.Pointer(cname))
+
+	for _, c := range append(b.gpus, b.cpus...) {
+		if t := C.ggml_get_tensor(c.ctx, cname); t != nil {
+			return &Tensor{t: t}
+		}
+	}
+
+	return nil
+}
+
+func (b *Backend) NewContext() ml.Context {
+	nodes := max(8192, len(b.meta.Tensors().Items())*5)
+	bts := make([]byte, C.size_t(nodes)*C.ggml_tensor_overhead()+C.ggml_graph_overhead_custom(C.size_t(nodes), false))
+	c := C.ggml_init(C.struct_ggml_init_params{
+		mem_buffer: unsafe.Pointer(&bts[0]),
+		mem_size:   C.size_t(len(bts)),
+		no_alloc:   true,
+	})
+
+	backends := make([]*C.struct_ggml_backend, len(b.gpus)+len(b.cpus))
+	bufts := make([]*C.struct_ggml_backend_buffer_type, len(b.gpus)+len(b.cpus))
+	for i, c := range append(b.gpus, b.cpus...) {
+		backends[i] = c.backend
+		bufts[i] = C.ggml_backend_get_default_buffer_type(c.backend)
+	}
+
+	return &Context{
+		ctx:     c,
+		backend: backends[0],
+		nodes:   nodes,
+		sched: C.ggml_backend_sched_new(
+			(*C.ggml_backend_t)(unsafe.Pointer(&backends[0])),
+			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&bufts[0])),
+			C.int(len(backends)),
+			C.size_t(nodes),
+			true,
+		),
+	}
+}
+
+type Context struct {
+	ctx     *C.struct_ggml_context
+	backend *C.struct_ggml_backend
+
+	sched *C.struct_ggml_backend_sched
+	graph *C.struct_ggml_cgraph
+	nodes int
+}
+
+func (c *Context) Forward(t ml.Tensor) {
+	if c.graph == nil {
+		c.graph = C.ggml_new_graph_custom(c.ctx, C.size_t(c.nodes), false)
+	}
+
+	C.ggml_build_forward_expand(c.graph, t.(*Tensor).t)
+}
+
+func (c *Context) Compute(t ml.Tensor) ml.Tensor {
+	c.Forward(t)
+	C.ggml_backend_sched_graph_compute_async(c.sched, c.graph)
+
+	backend := C.ggml_backend_sched_get_tensor_backend(c.sched, t.(*Tensor).t)
+
+	t.(*Tensor).data = make([]byte, C.ggml_nbytes(t.(*Tensor).t))
+	C.ggml_backend_tensor_get_async(backend, t.(*Tensor).t, unsafe.Pointer(&t.(*Tensor).data[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
+	return t
+}
+
+func (c Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
+	if len(shape) < 1 || len(shape) > 4 {
+		panic("unsupported number of dimensions")
+	}
+
+	for _, dim := range shape {
+		if dim < 1 {
+			panic("invalid shape")
+		}
+	}
+
+	var t *C.struct_ggml_tensor
+	switch dtype {
+	case ml.DTypeF32:
+		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_F32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
+	case ml.DTypeI32:
+		t = C.ggml_new_tensor(c.ctx, C.GGML_TYPE_I32, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
+	default:
+		panic("unsupported dtype")
+	}
+
+	b := C.ggml_backend_alloc_buffer(c.backend, C.ggml_nbytes(t))
+	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
+	C.ggml_set_zero(t)
+	return &Tensor{t: t}
+}
+
+func fromSlice[S ~[]E, E float32 | int32](ctx Context, s S, shape []int, dtype uint32) (ml.Tensor, error) {
+	n := len(s)
+	for _, v := range shape {
+		n /= v
+	}
+
+	if n != 1 {
+		return nil, fmt.Errorf("invalid shape %v for %d elements", shape, len(s))
+	}
+
+	t := C.ggml_new_tensor(ctx.ctx, dtype, C.int(len(shape)), (*C.int64_t)(unsafe.Pointer(&shape[0])))
+	b := C.ggml_backend_alloc_buffer(ctx.backend, C.ggml_nbytes(t))
+	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
+	C.ggml_backend_tensor_set(t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t))
+	return &Tensor{t: t}, nil
+}
+
+func (c Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
+	return fromSlice(c, s, shape, C.GGML_TYPE_F32)
+}
+
+func (c Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
+	return fromSlice(c, s, shape, C.GGML_TYPE_I32)
+}
+
+func (c *Context) Close() error {
+	C.ggml_backend_sched_free(c.sched)
+	C.ggml_free(c.ctx)
+	return nil
+}
+
+type Tensor struct {
+	t    *C.struct_ggml_tensor
+	data []byte
+}
+
+func (t *Tensor) LogValue() slog.Value {
+	return slog.GroupValue(
+		slog.String("name", C.GoString(C.ggml_get_name(t.t))),
+		slog.String("type", C.GoString(C.ggml_type_name(t.t._type))),
+		slog.Any("shape", t.Shape()),
+	)
+}
+
+func (t *Tensor) Dim(n int) int64 {
+	return int64(t.t.ne[n])
+}
+
+func (t *Tensor) Stride(n int) int64 {
+	return int64(t.t.nb[n])
+}
+
+func (t *Tensor) Shape() []int64 {
+	shape := make([]int64, C.ggml_n_dims(t.t))
+	for i := range shape {
+		shape[i] = t.Dim(i)
+	}
+
+	return shape
+}
+
+func (t *Tensor) Bytes() []byte {
+	if bts := C.ggml_get_data(t.t); bts != nil {
+		return C.GoBytes(bts, C.int(C.ggml_nbytes(t.t)))
+	}
+
+	return nil
+}
+
+func (t *Tensor) Floats() (f32s []float32) {
+	if t.data != nil {
+		f32s = make([]float32, C.ggml_nelements(t.t))
+		_ = binary.Read(bytes.NewReader(t.data), binary.LittleEndian, f32s)
+	}
+
+	return
+}
+
+func (t *Tensor) DType() ml.DType {
+	switch t.t._type {
+	case C.GGML_TYPE_F32:
+		return ml.DTypeF32
+	case C.GGML_TYPE_I32:
+		return ml.DTypeI32
+	default:
+		return ml.DTypeOther
+	}
+}
+
+func (t *Tensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_add(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
+	}
+}
+
+func (t *Tensor) Stack(ctx ml.Context, dim int, s ...ml.Tensor) ml.Tensor {
+	if len(s) > 0 {
+		return t.Concat(ctx, s[0].Stack(ctx, dim, s[1:]...), dim)
+	}
+
+	return t
+}
+
+func (t *Tensor) Concat(ctx ml.Context, t2 ml.Tensor, dim int) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_concat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(dim)),
+	}
+}
+
+func (t *Tensor) Contiguous(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_cont(ctx.(*Context).ctx, t.t),
+	}
+}
+
+func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_mul(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
+	}
+}
+
+func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_mul_mat(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
+	}
+}
+
+func (t *Tensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, eps float32) ml.Tensor {
+	tt := (&Tensor{t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
+	if b != nil {
+		tt = tt.Add(ctx, b)
+	}
+
+	return tt
+}
+
+func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
+	return (&Tensor{t: C.ggml_norm(ctx.(*Context).ctx, t.t, C.float(eps))}).Mul(ctx, w)
+}
+
+func (t *Tensor) Pad(ctx ml.Context, shape ...int64) ml.Tensor {
+	if len(shape) != 4 {
+		panic("expected 4 dimensions")
+	}
+
+	return &Tensor{
+		t: C.ggml_pad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
+	}
+}
+
+func (t *Tensor) Permute(ctx ml.Context, shape ...int) ml.Tensor {
+	if len(shape) != 4 {
+		panic("expected 4 dimensions")
+	}
+
+	return &Tensor{
+		t: C.ggml_permute(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
+	}
+}
+
+func (t *Tensor) Rows(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_get_rows(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
+	}
+}
+
+func (t *Tensor) Copy(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_cpy(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
+	}
+}
+
+func (t *Tensor) Reshape(ctx ml.Context, shape ...int64) ml.Tensor {
+	switch len(shape) {
+	case 1:
+		return &Tensor{
+			t: C.ggml_reshape_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0])),
+		}
+	case 2:
+		return &Tensor{
+			t: C.ggml_reshape_2d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1])),
+		}
+	case 3:
+		return &Tensor{
+			t: C.ggml_reshape_3d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2])),
+		}
+	case 4:
+		return &Tensor{
+			t: C.ggml_reshape_4d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.int64_t(shape[1]), C.int64_t(shape[2]), C.int64_t(shape[3])),
+		}
+	default:
+		panic("unsupported number of dimensions")
+	}
+}
+
+func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_scale(ctx.(*Context).ctx, t.t, (C.float)(s)),
+	}
+}
+
+func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_soft_max(ctx.(*Context).ctx, t.t),
+	}
+}
+
+func (t *Tensor) Tanh(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_tanh_inplace(ctx.(*Context).ctx, t.t),
+	}
+}
+
+func (t *Tensor) Unpad(ctx ml.Context, shape ...int64) ml.Tensor {
+	if len(shape) != 4 {
+		panic("expected 4 dimensions")
+	}
+
+	return &Tensor{
+		t: C.ggml_unpad(ctx.(*Context).ctx, t.t, C.int(shape[0]), C.int(shape[1]), C.int(shape[2]), C.int(shape[3])),
+	}
+}
+
+func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
+	switch len(shape) {
+	case 1:
+		return &Tensor{
+			t: C.ggml_view_1d(ctx.(*Context).ctx, t.t, C.int64_t(shape[0]), C.size_t(offset)),
+		}
+	case 3:
+		return &Tensor{
+			t: C.ggml_view_2d(ctx.(*Context).ctx, t.t,
+				C.int64_t(shape[0]), C.int64_t(shape[2]),
+				C.size_t(shape[1]),
+				C.size_t(offset)),
+		}
+	case 5:
+		return &Tensor{
+			t: C.ggml_view_3d(ctx.(*Context).ctx, t.t,
+				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]),
+				C.size_t(shape[1]), C.size_t(shape[3]),
+				C.size_t(offset)),
+		}
+	case 7:
+		return &Tensor{
+			t: C.ggml_view_4d(ctx.(*Context).ctx, t.t,
+				C.int64_t(shape[0]), C.int64_t(shape[2]), C.int64_t(shape[4]), C.int64_t(shape[6]),
+				C.size_t(shape[1]), C.size_t(shape[3]), C.size_t(shape[5]),
+				C.size_t(offset)),
+		}
+	default:
+		panic("unsupported number of dimensions")
+	}
+}
+
+const (
+	ropeTypeNorm C.int = iota
+)
+
+func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim uint32, ropeBase, ropeScale float32) ml.Tensor {
+	if ropeFactors == nil {
+		ropeFactors = &Tensor{}
+	}
+
+	return &Tensor{
+		t: C.ggml_rope_ext(
+			ctx.(*Context).ctx, t.t, positionIDs.(*Tensor).t, ropeFactors.(*Tensor).t,
+			C.int(ropeDim),
+			131072,       // YaRN n_ctx_train
+			ropeTypeNorm, // ROPE_TYPE_NORM
+			C.float(ropeBase),
+			C.float(ropeScale),
+			0.,  // YaRN ext_factor
+			1.,  // YaRN attn_factor
+			32., // YaRN beta_fast
+			1.,  // YaRN beta_slow
+		),
+	}
+}
+
+func (t *Tensor) GELU(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_gelu_inplace(ctx.(*Context).ctx, t.t),
+	}
+}
+
+func (t *Tensor) SILU(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_silu_inplace(ctx.(*Context).ctx, t.t),
+	}
+}
+
+func (t *Tensor) Conv2D(ctx ml.Context, t2 ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
+	return &Tensor{
+		t: C.ggml_conv_2d(ctx.(*Context).ctx, t.t, t2.(*Tensor).t, C.int(s0), C.int(s1), C.int(p0), C.int(p1), C.int(d0), C.int(d1)),
+	}
+}
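
Usage sketch (not part of this diff): the Context above builds a lazy ggml graph, so tensor methods only record operations; Compute expands the graph, hands it to the backend scheduler, and copies the result back into host memory so Floats can decode it. The snippet assumes the ml.Backend and ml.Context interfaces expose the same methods implemented here, and "model.gguf" is a placeholder path to any valid GGUF file.

package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/ml"
	_ "github.com/ollama/ollama/ml/backend" // registers the "ggml" backend
)

func main() {
	f, err := os.Open("model.gguf") // hypothetical GGUF file
	if err != nil {
		panic(err)
	}
	defer f.Close()

	b, err := ml.NewBackend(f)
	if err != nil {
		panic(err)
	}

	ctx := b.NewContext()
	defer ctx.Close()

	// FromFloatSlice allocates backend buffers and uploads the data.
	x, _ := ctx.FromFloatSlice([]float32{1, 2, 3, 4}, 4)
	y, _ := ctx.FromFloatSlice([]float32{10, 20, 30, 40}, 4)

	// Add only records the op; Compute runs the scheduler and reads the
	// result back into host memory.
	sum := ctx.Compute(x.Add(ctx, y))
	fmt.Println(sum.Floats()) // expected: [11 22 33 44]
}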

+ 11 - 0
ml/nn/convolution.go

@@ -0,0 +1,11 @@
+package nn
+
+import "github.com/ollama/ollama/ml"
+
+type Conv2D struct {
+	Weight ml.Tensor `gguf:"weight"`
+}
+
+func (m *Conv2D) Forward(ctx ml.Context, t ml.Tensor, s0, s1, p0, p1, d0, d1 int) ml.Tensor {
+	return m.Weight.Conv2D(ctx, t, s0, s1, p0, p1, d0, d1)
+}

+ 11 - 0
ml/nn/embedding.go

@@ -0,0 +1,11 @@
+package nn
+
+import "github.com/ollama/ollama/ml"
+
+type Embedding struct {
+	Weight ml.Tensor `gguf:"weight"`
+}
+
+func (m *Embedding) Forward(ctx ml.Context, hiddenState ml.Tensor) ml.Tensor {
+	return m.Weight.Rows(ctx, hiddenState)
+}

+ 17 - 0
ml/nn/linear.go

@@ -0,0 +1,17 @@
+package nn
+
+import "github.com/ollama/ollama/ml"
+
+type Linear struct {
+	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
+}
+
+func (m *Linear) Forward(ctx ml.Context, t ml.Tensor) ml.Tensor {
+	t = m.Weight.Mulmat(ctx, t)
+	if m.Bias != nil {
+		t = t.Add(ctx, m.Bias)
+	}
+
+	return t
+}

+ 22 - 0
ml/nn/normalization.go

@@ -0,0 +1,22 @@
+package nn
+
+import (
+	"github.com/ollama/ollama/ml"
+)
+
+type LayerNorm struct {
+	Weight ml.Tensor `gguf:"weight"`
+	Bias   ml.Tensor `gguf:"bias"`
+}
+
+func (m *LayerNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
+	return t.LayerNorm(ctx, m.Weight, m.Bias, eps)
+}
+
+type RMSNorm struct {
+	Weight ml.Tensor `gguf:"weight"`
+}
+
+func (m *RMSNorm) Forward(ctx ml.Context, t ml.Tensor, eps float32) ml.Tensor {
+	return t.RMSNorm(ctx, m.Weight, eps)
+}
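
These nn wrappers are thin aliases over the tensor ops above: Embedding is a row lookup, Linear calls Mulmat with the weight as the receiver (the ggml convention for weight-times-input) plus an optional bias, and the norm types delegate to LayerNorm/RMSNorm on the tensor itself. A hedged sketch of how they compose; the field names and gguf tags below are illustrative, not taken from this commit:

package example

import (
	"github.com/ollama/ollama/ml"
	"github.com/ollama/ollama/ml/nn"
)

// Block is a toy sub-block; its fields would be filled in from GGUF
// tensors by the model loader (see model/model.go below).
type Block struct {
	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
	Norm           *nn.RMSNorm   `gguf:"attn_norm"`
	Proj           *nn.Linear    `gguf:"attn_q"`
}

func (b *Block) Forward(ctx ml.Context, ids ml.Tensor, eps float32) ml.Tensor {
	h := b.TokenEmbedding.Forward(ctx, ids) // look up rows of the embedding table
	h = b.Norm.Forward(ctx, h, eps)         // RMS-normalize, then scale by the weight
	return b.Proj.Forward(ctx, h)           // Weight.Mulmat(h), plus Bias when present
}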

+ 160 - 0
model/cmd/main.go

@@ -0,0 +1,160 @@
+package main
+
+import (
+	"errors"
+	"flag"
+	"fmt"
+	"image"
+	"io"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/ollama/ollama/cache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	_ "github.com/ollama/ollama/model/llama"
+	_ "github.com/ollama/ollama/model/mllama"
+	"github.com/ollama/ollama/sample"
+)
+
+var args struct {
+	n     int
+	debug bool
+	image string
+	cache bool
+}
+
+func temp() error {
+	flag.IntVar(&args.n, "n", 10, "number of samples")
+	flag.BoolVar(&args.debug, "debug", false, "enable debug logging")
+	flag.StringVar(&args.image, "image", "", "path to image file")
+	flag.BoolVar(&args.cache, "cache", false, "enable KV cache")
+
+	flag.Parse()
+
+	var prompt string
+	if n := len(flag.Args()); n == 1 {
+		bts, err := io.ReadAll(os.Stdin)
+		if err != nil {
+			return err
+		}
+
+		prompt = string(bts)
+	} else if n > 1 {
+		prompt = strings.Join(flag.Args()[1:], " ")
+	} else {
+		return fmt.Errorf("usage: %s path/to/file <prompt\n", filepath.Base(os.Args[0]))
+	}
+
+	level := slog.LevelInfo
+	if args.debug {
+		level = slog.LevelDebug
+	}
+
+	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
+		Level:     level,
+		AddSource: true,
+		ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
+			if attr.Key == slog.SourceKey {
+				source := attr.Value.Any().(*slog.Source)
+				source.File = filepath.Base(source.File)
+			}
+
+			return attr
+		},
+	})))
+
+	m, err := model.New(flag.Arg(0))
+	if err != nil {
+		return err
+	}
+
+	inputIDs, err := m.(model.TextProcessor).Encode(prompt)
+	if err != nil {
+		return err
+	}
+
+	var opts []model.OptionsFunc
+	if args.cache {
+		opts = append(opts, model.WithCache(&cache.Simple{
+			Capacity: 2048,
+			DType:    ml.DTypeF32,
+		}))
+	}
+
+	if args.image != "" {
+		if err := func() error {
+			f, err := os.Open(args.image)
+			if err != nil {
+				return err
+			}
+			defer f.Close()
+
+			img, _, err := image.Decode(f)
+			if err != nil {
+				return err
+			}
+
+			opts = append(opts, model.WithImage(img))
+			return nil
+		}(); err != nil {
+			return err
+		}
+	}
+
+	var offset int
+	for range args.n {
+		logit, err := model.Forward(m, append(opts, model.WithInputIDs(inputIDs), model.WithOffset(offset))...)
+		if err != nil {
+			return err
+		}
+
+		f32s := logit.Floats()
+		f64s := make([]float64, len(f32s))
+		for i, f32 := range f32s {
+			f64s[i] = float64(f32)
+		}
+
+		// do sampling
+		f64s, err = sample.Sample(f64s, sample.Greedy())
+		if err != nil {
+			return err
+		}
+
+		var outputIDs []int32
+		for _, f64 := range f64s {
+			if !m.(model.TextProcessor).Is(uint32(f64), model.SpecialEOS) {
+				outputIDs = append(outputIDs, int32(f64))
+			}
+		}
+
+		if len(outputIDs) == 0 {
+			break
+		}
+
+		s, err := m.(model.TextProcessor).Decode(outputIDs)
+		if errors.Is(err, io.EOF) {
+			break
+		} else if err != nil {
+			return err
+		}
+
+		fmt.Print(s)
+
+		inputIDs = append(inputIDs, outputIDs...)
+		if args.cache {
+			offset = len(inputIDs) - 1
+		}
+	}
+
+	return nil
+}
+
+func main() {
+	if err := temp(); err != nil {
+		fmt.Println("err", err)
+		os.Exit(1)
+	}
+}
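
The harness takes the model path as the first positional argument, reads the prompt from the remaining arguments (or from stdin when only the path is given), and greedily decodes up to -n tokens. A hypothetical invocation from the repository root, with placeholder paths and prompt:

go run ./model/cmd -n 20 -cache path/to/model.gguf "why is the sky blue?"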

+ 155 - 0
model/llama/model.go

@@ -0,0 +1,155 @@
+package llama
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+)
+
+type Options struct {
+	RopeFactors                      ml.Tensor `gguf:"rope_freqs.weight"`
+	hiddenSize, numHeads, numKVHeads int64
+	eps, ropeBase, ropeScale         float32
+	ropeDim                          uint32
+}
+
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	Layers         []Layer       `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
+
+	*Options
+}
+
+func New(c ml.Config) (model.Model, error) {
+	return &Model{
+		BytePairEncoding: model.BytePairEncoding{
+			Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			Vocabulary: &model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Uints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    c.Uint("tokenizer.ggml.bos_token_id"),
+				EOS:    c.Uint("tokenizer.ggml.eos_token_id"),
+			},
+		},
+		Layers: make([]Layer, c.Uint("block_count")),
+		Options: &Options{
+			hiddenSize: int64(c.Uint("embedding_length")),
+			numHeads:   int64(c.Uint("attention.head_count")),
+			numKVHeads: int64(c.Uint("attention.head_count_kv")),
+			eps:        c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:   c.Float("rope.freq_base"),
+			ropeScale:  c.Float("rope.freq_scale", 1),
+			ropeDim:    c.Uint("rope.dimension_count"),
+		},
+	}, nil
+}
+
+type SelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenState.Dim(1)
+	headDim := opts.hiddenSize / opts.numHeads
+
+	q := sa.Query.Forward(ctx, hiddenState)
+	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	q = q.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+
+	k := sa.Key.Forward(ctx, hiddenState)
+	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	k = k.RoPE(ctx, positionIDs, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+
+	v := sa.Value.Forward(ctx, hiddenState)
+	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+
+	k, v = cache.Put(ctx, k, v, cache.Options)
+
+	q = q.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	k = k.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	kq := k.Mulmat(ctx, q)
+	kq = kq.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
+	kq = kq.Softmax(ctx)
+
+	kqv := v.Mulmat(ctx, kq)
+	kqv = kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
+
+	return sa.Output.Forward(ctx, kqv)
+}
+
+type MLP struct {
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+	Gate *nn.Linear `gguf:"ffn_gate"`
+}
+
+func (mlp *MLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *Options) ml.Tensor {
+	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
+	return mlp.Down.Forward(ctx, hiddenState)
+}
+
+type Layer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention *SelfAttention
+	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP           *MLP
+}
+
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache model.Cache, opts *Options) ml.Tensor {
+	residual := hiddenState
+
+	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
+	hiddenState = hiddenState.Add(ctx, residual)
+	residual = hiddenState
+
+	hiddenState = l.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = l.MLP.Forward(ctx, hiddenState, opts)
+	return hiddenState.Add(ctx, residual)
+}
+
+func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
+	inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
+	if err != nil {
+		return nil, err
+	}
+
+	positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
+	if err != nil {
+		return nil, err
+	}
+
+	hiddenState := m.TokenEmbedding.Forward(ctx, inputs)
+
+	for i, layer := range m.Layers {
+		hiddenState = layer.Forward(ctx, hiddenState, positions, opts.Cache.Sub(i), m.Options)
+	}
+
+	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
+	hiddenState = m.Output.Forward(ctx, hiddenState)
+
+	outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	return hiddenState.Rows(ctx, outputs), nil
+}
+
+func init() {
+	model.Register("llama", New)
+}
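
As a worked example of the shapes in SelfAttention.Forward, take Llama-3.2-1B-style hyperparameters (illustrative only; the real values come from the GGUF keys read in New): embedding_length 2048, head_count 32, head_count_kv 8, so headDim = 2048/32 = 64. For S input tokens the query projection is reshaped to [64, 32, S] and the key/value projections to [64, 8, S]; RoPE is applied per head, the KV cache stores the grouped key/value heads, and the Permute calls rotate the head axis outward so Mulmat contracts over headDim. The final Reshape back to [2048, S] feeds Output.Forward, and Model.Forward then keeps only the last position's logits via Rows.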

+ 90 - 0
model/mllama/model.go

@@ -0,0 +1,90 @@
+package mllama
+
+import (
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+)
+
+type Model struct {
+	model.Base
+
+	*VisionModel `gguf:"v,vision"`
+	*TextModel
+
+	Projector *nn.Linear `gguf:"mm.0"`
+
+	ImageProcessor
+	TextProcessor
+}
+
+func New(c ml.Config) (model.Model, error) {
+	return &Model{
+		ImageProcessor: newImageProcessor(c),
+		VisionModel:    newVisionModel(c),
+		TextProcessor:  newTextProcessor(c),
+		TextModel:      newTextModel(c),
+	}, nil
+}
+
+func (m *Model) Forward(ctx ml.Context, opts model.Options) (ml.Tensor, error) {
+	var crossAttentionStates ml.Tensor
+	if opts.Images != nil {
+		f32s, aspectRatioID, err := m.ImageProcessor.ProcessImage(opts.Images[0])
+		if err != nil {
+			return nil, err
+		}
+
+		pixelValues, err := ctx.FromFloatSlice(f32s,
+			m.ImageProcessor.imageSize,
+			m.ImageProcessor.imageSize,
+			m.ImageProcessor.numChannels,
+			m.ImageProcessor.maxNumTiles,
+		)
+		if err != nil {
+			return nil, err
+		}
+
+		aspectRatio, err := ctx.FromIntSlice([]int32{int32(aspectRatioID)}, 1)
+		if err != nil {
+			return nil, err
+		}
+
+		positions := make([]int32, 1601)
+		for i := range positions {
+			positions[i] = int32(i)
+		}
+
+		positionIDs, err := ctx.FromIntSlice(positions, len(positions))
+		if err != nil {
+			return nil, err
+		}
+
+		crossAttentionStates = m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
+		crossAttentionStates = m.Projector.Forward(ctx, crossAttentionStates)
+	}
+
+	inputs, err := ctx.FromIntSlice(opts.Inputs(), len(opts.Inputs()))
+	if err != nil {
+		return nil, err
+	}
+
+	positions, err := ctx.FromIntSlice(opts.Positions(), len(opts.Positions()))
+	if err != nil {
+		return nil, err
+	}
+
+	// TODO: attention mask, cross attention mask
+	hiddenState := m.TextModel.Forward(ctx, inputs, positions, nil, crossAttentionStates, nil, opts.Cache)
+
+	outputs, err := ctx.FromIntSlice([]int32{int32(len(opts.Positions())) - 1}, 1)
+	if err != nil {
+		return nil, err
+	}
+
+	return hiddenState.Rows(ctx, outputs), nil
+}
+
+func init() {
+	model.Register("mllama", New)
+}
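
The 1601 position IDs are the vision tower's per-tile sequence length: assuming mllama's usual vision.image_size of 560 and vision.patch_size of 14 (these are read from the GGUF at runtime, not fixed here), (560/14)² = 1600 patches plus one class embedding gives 1601 positions, and with vision.max_num_tiles of 4 the buffer handed to FromFloatSlice holds 560×560×3×4 floats. The projected cross-attention states are computed once per image and reused by every cross-attention layer of the text model.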

+ 225 - 0
model/mllama/model_text.go

@@ -0,0 +1,225 @@
+package mllama
+
+import (
+	"math"
+	"slices"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/model"
+)
+
+type TextSelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
+func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions, mask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
+	batchSize := hiddenState.Dim(1)
+	headDim := opts.hiddenSize / opts.numHeads
+
+	query := sa.Query.Forward(ctx, hiddenState)
+	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	query = query.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+
+	key := sa.Key.Forward(ctx, hiddenState)
+	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	key = key.RoPE(ctx, positions, opts.RopeFactors, opts.ropeDim, opts.ropeBase, opts.ropeScale)
+
+	value := sa.Value.Forward(ctx, hiddenState)
+	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+
+	key, value = cache.Put(ctx, key, value, cache.Options)
+
+	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	scores := key.Mulmat(ctx, query)
+	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
+
+	if mask != nil {
+		scores = scores.Add(ctx, mask)
+	}
+
+	scores = scores.Softmax(ctx)
+
+	attention := value.Mulmat(ctx, scores)
+	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
+
+	return sa.Output.Forward(ctx, attention)
+}
+
+type TextMLP struct {
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+	Gate *nn.Linear `gguf:"ffn_gate"`
+}
+
+func (mlp *TextMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *TextModelOptions) ml.Tensor {
+	hiddenState = mlp.Gate.Forward(ctx, hiddenState).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenState))
+	return mlp.Down.Forward(ctx, hiddenState)
+}
+
+type TextSelfAttentionDecoderLayer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention *TextSelfAttention
+
+	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP     *TextMLP
+}
+
+func (d *TextSelfAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, positions, mask, _, _ ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
+	residual := hiddenState
+
+	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = d.SelfAttention.Forward(ctx, hiddenState, positions, mask, cache, opts)
+	hiddenState = hiddenState.Add(ctx, residual)
+	residual = hiddenState
+
+	hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
+	return hiddenState.Add(ctx, residual)
+}
+
+type TextCrossAttention struct {
+	QueryNorm *nn.RMSNorm `gguf:"cross_attn_q_norm"`
+	Query     *nn.Linear  `gguf:"cross_attn_q_proj"`
+	KeyNorm   *nn.RMSNorm `gguf:"cross_attn_k_norm"`
+	Key       *nn.Linear  `gguf:"cross_attn_k_proj"`
+	Value     *nn.Linear  `gguf:"cross_attn_v_proj"`
+	Output    *nn.Linear  `gguf:"cross_attn_o_proj"`
+}
+
+func (ca *TextCrossAttention) Forward(ctx ml.Context, hiddenState, crossAttentionStates ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
+	batchSize := hiddenState.Dim(1)
+	headDim := opts.hiddenSize / opts.numHeads
+	numVisionTokens, numTiles := crossAttentionStates.Dim(1), crossAttentionStates.Dim(2)
+
+	query := ca.Query.Forward(ctx, hiddenState)
+	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
+	query = ca.QueryNorm.Forward(ctx, query, opts.eps)
+
+	key := ca.Key.Forward(ctx, crossAttentionStates)
+	key = key.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
+	key = ca.KeyNorm.Forward(ctx, key, opts.eps)
+
+	value := ca.Value.Forward(ctx, crossAttentionStates)
+	value = value.Reshape(ctx, headDim, opts.numKVHeads, numVisionTokens*numTiles)
+
+	// TODO cache key, value
+
+	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	scores := key.Mulmat(ctx, query)
+	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
+	scores = scores.Softmax(ctx)
+
+	attention := value.Mulmat(ctx, scores)
+	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
+
+	return ca.Output.Forward(ctx, attention)
+}
+
+type TextCrossAttentionDecoderLayer struct {
+	AttentionNorm  *nn.RMSNorm `gguf:"attn_norm"`
+	CrossAttention *TextCrossAttention
+	AttentionGate  ml.Tensor `gguf:"cross_attn_attn_gate"`
+
+	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP     *TextMLP
+	MLPGate ml.Tensor `gguf:"cross_attn_mlp_gate"`
+}
+
+func (d TextCrossAttentionDecoderLayer) Forward(ctx ml.Context, hiddenState, _, _, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
+	residual := hiddenState
+
+	hiddenState = d.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = d.CrossAttention.Forward(ctx, hiddenState, crossAttentionStates, cache, opts)
+	hiddenState = hiddenState.Mul(ctx, d.AttentionGate.Tanh(ctx))
+	hiddenState = hiddenState.Add(ctx, residual)
+	residual = hiddenState
+
+	hiddenState = d.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = d.MLP.Forward(ctx, hiddenState, opts)
+	hiddenState = hiddenState.Mul(ctx, d.MLPGate.Tanh(ctx))
+	return hiddenState.Add(ctx, residual)
+}
+
+type TextDecoderLayer interface {
+	Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor
+}
+
+type TextDecoder struct {
+	Layers []TextDecoderLayer
+}
+
+func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache, opts *TextModelOptions) ml.Tensor {
+	for i, layer := range d.Layers {
+		if !slices.Contains(opts.crossAttentionLayers, uint32(i)) || crossAttentionStates != nil {
+			hiddenState = layer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache.Sub(i), opts)
+		}
+	}
+
+	return hiddenState
+}
+
+type TextModelOptions struct {
+	RopeFactors ml.Tensor `gguf:"rope_freqs.weight"`
+
+	hiddenSize, numHeads, numKVHeads int64
+	eps, ropeBase, ropeScale         float32
+	ropeDim                          uint32
+
+	crossAttentionLayers []uint32
+}
+
+type TextModel struct {
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	Transformer    *TextDecoder  `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output"`
+
+	*TextModelOptions
+}
+
+func (m *TextModel) Forward(ctx ml.Context, inputIDs, positionIDs, mask, crossAttentionStates, crossAttentionMask ml.Tensor, cache model.Cache) ml.Tensor {
+	hiddenState := m.TokenEmbedding.Forward(ctx, inputIDs)
+	hiddenState = m.Transformer.Forward(ctx, hiddenState, positionIDs, mask, crossAttentionStates, crossAttentionMask, cache, m.TextModelOptions)
+	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
+	return m.Output.Forward(ctx, hiddenState)
+}
+
+func newTextModel(c ml.Config) *TextModel {
+	var decoderLayers []TextDecoderLayer
+	for i := range c.Uint("block_count") {
+		var textDecoderLayer TextDecoderLayer
+		if slices.Contains(c.Uints("attention.cross_attention_layers"), i) {
+			textDecoderLayer = &TextCrossAttentionDecoderLayer{}
+		} else {
+			textDecoderLayer = &TextSelfAttentionDecoderLayer{}
+		}
+
+		decoderLayers = append(decoderLayers, textDecoderLayer)
+	}
+
+	return &TextModel{
+		Transformer: &TextDecoder{Layers: decoderLayers},
+		TextModelOptions: &TextModelOptions{
+			hiddenSize:           int64(c.Uint("embedding_length")),
+			numHeads:             int64(c.Uint("attention.head_count")),
+			numKVHeads:           int64(c.Uint("attention.head_count_kv")),
+			eps:                  c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:             c.Float("rope.freq_base"),
+			ropeScale:            c.Float("rope.freq_scale", 1),
+			ropeDim:              c.Uint("rope.dimension_count"),
+			crossAttentionLayers: c.Uints("attention.cross_attention_layers"),
+		},
+	}
+}
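
Note the dispatch in TextDecoder.Forward: self-attention blocks always run, while a cross-attention block is skipped entirely when crossAttentionStates is nil (no image in the batch, and nothing cached yet, per the TODO in TextCrossAttention). newTextModel decides each block's type from attention.cross_attention_layers; in the 11B Vision configuration that list is typically every fifth layer starting at index 3, so most blocks remain plain self-attention.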

+ 234 - 0
model/mllama/model_vision.go

@@ -0,0 +1,234 @@
+package mllama
+
+import (
+	"math"
+	"slices"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+)
+
+var batchSize int64 = 1
+
+type VisionSelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_out"`
+
+	Gate ml.Tensor `gguf:"attn_gate"`
+}
+
+func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	headDim := opts.hiddenSize / opts.numHeads
+
+	query := sa.Query.Forward(ctx, hiddenState)
+	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
+	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+
+	key := sa.Key.Forward(ctx, hiddenState)
+	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
+	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+
+	value := sa.Value.Forward(ctx, hiddenState)
+	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
+	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	scores := key.Mulmat(ctx, query)
+	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
+	scores = scores.Softmax(ctx)
+
+	attention := value.Mulmat(ctx, scores)
+	attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
+	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
+
+	hiddenState = sa.Output.Forward(ctx, attention)
+	if sa.Gate != nil {
+		hiddenState = hiddenState.Mul(ctx, sa.Gate)
+	}
+
+	return hiddenState
+}
+
+type VisionMLP struct {
+	Down *nn.Linear `gguf:"ffn_down"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+
+	Gate ml.Tensor `gguf:"ffn_gate"`
+}
+
+func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	hiddenState = mlp.Down.Forward(ctx, hiddenState).GELU(ctx)
+	hiddenState = mlp.Up.Forward(ctx, hiddenState)
+	if mlp.Gate != nil {
+		hiddenState = hiddenState.Mul(ctx, mlp.Gate)
+	}
+
+	return hiddenState
+}
+
+type VisionEncoderLayer struct {
+	AttentionNorm *nn.LayerNorm `gguf:"ln1"`
+	SelfAttention *VisionSelfAttention
+
+	MLPNorm *nn.LayerNorm `gguf:"ln2"`
+	MLP     *VisionMLP
+}
+
+func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	residual := hiddenState
+
+	// self attention
+	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
+	hiddenState = hiddenState.Add(ctx, residual)
+	residual = hiddenState
+
+	// feed forward
+	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
+	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
+	return hiddenState.Add(ctx, residual)
+}
+
+type VisionEncoder struct {
+	Layers []VisionEncoderLayer
+}
+
+func (e *VisionEncoder) Forward(ctx ml.Context, hiddenState ml.Tensor, intermediateLayersIndices []uint32, opts *VisionModelOptions) (ml.Tensor, []ml.Tensor) {
+	var intermediateHiddenStates []ml.Tensor
+	for i, layer := range e.Layers {
+		if slices.Contains(intermediateLayersIndices, uint32(i)) {
+			intermediateHiddenStates = append(intermediateHiddenStates, hiddenState.Reshape(ctx, append([]int64{1}, hiddenState.Shape()...)...))
+		}
+
+		hiddenState = layer.Forward(ctx, hiddenState, opts)
+	}
+
+	return hiddenState, intermediateHiddenStates
+}
+
+type PrecomputedAspectRatioEmbedding struct {
+	Embedding *nn.Embedding
+	Gate      ml.Tensor `gguf:"gate"`
+}
+
+func (e *PrecomputedAspectRatioEmbedding) Forward(ctx ml.Context, hiddenState ml.Tensor, aspectRatioIDs ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	embeddings := e.Embedding.Forward(ctx, aspectRatioIDs)
+	embeddings = embeddings.Reshape(ctx, opts.hiddenSize, 1, opts.numTiles)
+	if e.Gate != nil {
+		embeddings = embeddings.Mul(ctx, e.Gate)
+	}
+
+	return hiddenState.Add(ctx, embeddings)
+}
+
+type PrecomputedPositionEmbedding struct {
+	PositionEmbedding     *nn.Embedding `gguf:"position_embd"`
+	PositionEmbeddingGate ml.Tensor     `gguf:"position_embd.gate"`
+
+	TilePositionEmbedding     *nn.Embedding `gguf:"tile_position_embd"`
+	TilePositionEmbeddingGate ml.Tensor     `gguf:"tile_position_embd.gate"`
+}
+
+func (e *PrecomputedPositionEmbedding) Forward(ctx ml.Context, hiddenState, positionIDs, aspectRatioIDs ml.Tensor, numPositions int64, opts *VisionModelOptions) ml.Tensor {
+	positionEmbedding := e.PositionEmbedding.Forward(ctx, positionIDs)
+	if e.PositionEmbeddingGate != nil {
+		positionEmbedding = positionEmbedding.Mul(ctx, e.PositionEmbeddingGate)
+	}
+
+	hiddenState = hiddenState.Add(ctx, positionEmbedding)
+
+	tilePositionEmbedding := e.TilePositionEmbedding.Forward(ctx, aspectRatioIDs)
+	tilePositionEmbedding = tilePositionEmbedding.Reshape(ctx, opts.hiddenSize, numPositions, opts.numTiles)
+	if e.TilePositionEmbeddingGate != nil {
+		tilePositionEmbedding = tilePositionEmbedding.Mul(ctx, e.TilePositionEmbeddingGate)
+	}
+
+	return hiddenState.Add(ctx, tilePositionEmbedding)
+}
+
+type VisionModelOptions struct {
+	hiddenSize, numHeads, numTiles int64
+	imageSize, patchSize           int
+	eps                            float32
+
+	intermediateLayersIndices []uint32
+}
+
+type VisionModel struct {
+	PatchEmbeddings *nn.Conv2D `gguf:"patch_embd"`
+
+	PreTilePositionEmbedding  *PrecomputedAspectRatioEmbedding `gguf:"pre_tile_position_embd"`
+	PostTilePositionEmbedding *PrecomputedAspectRatioEmbedding `gguf:"post_tile_position_embd"`
+	PositionEmbedding         *PrecomputedPositionEmbedding
+
+	PreLayerNorm   *nn.LayerNorm `gguf:"pre_ln"`
+	PostLayerNorm  *nn.LayerNorm `gguf:"post_ln"`
+	ClassEmbedding ml.Tensor     `gguf:"class_embd"`
+
+	Transformer       *VisionEncoder `gguf:"blk"`
+	GlobalTransformer *VisionEncoder `gguf:"global.blk"`
+
+	*VisionModelOptions
+}
+
+func (m *VisionModel) Forward(ctx ml.Context, pixelValues, positionIDs, aspectRatioIDs ml.Tensor) ml.Tensor {
+	numPatches := int64((m.imageSize / m.patchSize) * (m.imageSize / m.patchSize))
+	numPositions := numPatches
+	if m.ClassEmbedding != nil {
+		numPositions++
+	}
+
+	hiddenState := m.PatchEmbeddings.Forward(ctx, pixelValues, m.patchSize, m.patchSize, 0, 0, 1, 1)
+	hiddenState = hiddenState.Reshape(ctx, numPatches, m.hiddenSize, m.numTiles)
+	hiddenState = hiddenState.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+
+	hiddenState = m.PreTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
+	hiddenState = m.ClassEmbedding.Stack(ctx, 2, slices.Repeat([]ml.Tensor{m.ClassEmbedding}, int(m.numTiles)-1)...).Concat(ctx, hiddenState, 1)
+
+	hiddenState = m.PositionEmbedding.Forward(ctx, hiddenState, positionIDs, aspectRatioIDs, numPositions, m.VisionModelOptions)
+	hiddenState = m.PreLayerNorm.Forward(ctx, hiddenState, m.eps)
+
+	numPaddingPatches := 8 - (hiddenState.Dim(1)%8)%8
+	hiddenState = hiddenState.Pad(ctx, 0, numPaddingPatches, 0, 0)
+
+	hiddenState = hiddenState.Reshape(ctx, hiddenState.Dim(0), hiddenState.Dim(1)*hiddenState.Dim(2), batchSize)
+	hiddenState, intermediateHiddenStates := m.Transformer.Forward(ctx, hiddenState, m.intermediateLayersIndices, m.VisionModelOptions)
+
+	hiddenState = m.PostLayerNorm.Forward(ctx, hiddenState, m.eps)
+
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenState = m.PostTilePositionEmbedding.Forward(ctx, hiddenState, aspectRatioIDs, m.VisionModelOptions)
+
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, m.numTiles*(numPositions+numPaddingPatches), batchSize)
+	hiddenState, _ = m.GlobalTransformer.Forward(ctx, hiddenState, nil, m.VisionModelOptions)
+
+	hiddenStates := intermediateHiddenStates[0].Stack(ctx, 0, intermediateHiddenStates[1:]...)
+	hiddenStates = hiddenStates.Reshape(ctx, int64(len(intermediateHiddenStates))*m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenStates = hiddenStates.Unpad(ctx, 0, numPaddingPatches, 0, 0)
+
+	hiddenState = hiddenState.Reshape(ctx, m.hiddenSize, numPositions+numPaddingPatches, m.numTiles, batchSize)
+	hiddenState = hiddenState.Unpad(ctx, 0, numPaddingPatches, 0, 0)
+	return hiddenState.Concat(ctx, hiddenStates, 0)
+}
+
+func newVisionModel(c ml.Config) *VisionModel {
+	return &VisionModel{
+		Transformer:       &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count"))},
+		GlobalTransformer: &VisionEncoder{Layers: make([]VisionEncoderLayer, c.Uint("vision.global.block_count"))},
+
+		VisionModelOptions: &VisionModelOptions{
+			hiddenSize: int64(c.Uint("vision.embedding_length")),
+			numHeads:   int64(c.Uint("vision.attention.head_count")),
+			numTiles:   int64(c.Uint("vision.max_num_tiles")),
+
+			imageSize: int(c.Uint("vision.image_size")),
+			patchSize: int(c.Uint("vision.patch_size")),
+
+			eps: c.Float("vision.attention.layer_norm_epsilon"),
+
+			intermediateLayersIndices: c.Uints("vision.intermediate_layers_indices"),
+		},
+	}
+}
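
With the 1601 per-tile positions noted above (the usual mllama defaults, not values fixed by this file), numPaddingPatches evaluates to 8 - 1601%8 = 7, so each tile is padded to 1608 positions before the encoders and trimmed back by Unpad afterwards. The intermediate hidden states collected from the selected encoder layers are stacked, reshaped, and concatenated with the global transformer's output along the feature axis; that combined tensor, after the mm.0 projection in model.go, is what the text model's cross-attention consumes.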

+ 240 - 0
model/mllama/process_image.go

@@ -0,0 +1,240 @@
+package mllama
+
+import (
+	"image"
+	"image/color"
+	"math"
+	"slices"
+
+	"golang.org/x/image/draw"
+
+	"github.com/ollama/ollama/ml"
+)
+
+type ImageProcessor struct {
+	imageSize, numChannels, maxNumTiles int
+}
+
+func newImageProcessor(c ml.Config) ImageProcessor {
+	return ImageProcessor{
+		imageSize:   int(c.Uint("vision.image_size")),
+		numChannels: int(c.Uint("vision.num_channels")),
+		maxNumTiles: int(c.Uint("vision.max_num_tiles")),
+	}
+}
+
+func (p *ImageProcessor) supportedAspectRatios(maxTiles int) []image.Point {
+	ratios := []image.Point{}
+
+	for w := range maxTiles {
+		for h := range maxTiles {
+			if (w+1)*(h+1) <= maxTiles {
+				ratios = append(ratios, image.Point{w + 1, h + 1})
+			}
+		}
+	}
+
+	return ratios
+}
+
+func (p *ImageProcessor) clip(a, a_min, a_max int) int {
+	if a < a_min {
+		return a_min
+	} else if a > a_max {
+		return a_max
+	}
+
+	return a
+}
+
+func (p *ImageProcessor) fitToCanvas(imageSize, canvasSize image.Point, tileSize int) image.Point {
+	targetWidth := p.clip(imageSize.X, tileSize, canvasSize.X)
+	targetHeight := p.clip(imageSize.Y, tileSize, canvasSize.Y)
+
+	scaleWidth := float64(targetWidth) / float64(imageSize.X)
+	scaleHeight := float64(targetHeight) / float64(imageSize.Y)
+
+	var w, h int
+
+	if scaleWidth < scaleHeight {
+		w = targetWidth
+		h = min(int(math.Floor(float64(imageSize.Y)*scaleWidth)), targetHeight)
+	} else {
+		w = min(int(math.Floor(float64(imageSize.X)*scaleHeight)), targetWidth)
+		h = targetHeight
+	}
+
+	return image.Point{w, h}
+}
+
+func (p *ImageProcessor) optimalTiledCanvas(imageSize image.Point, maxImageTiles, tileSize int) image.Point {
+	possibleTileArrangements := p.supportedAspectRatios(maxImageTiles)
+	possibleCanvasSizes := []image.Point{}
+	for _, pta := range possibleTileArrangements {
+		possibleCanvasSizes = append(possibleCanvasSizes, image.Point{pta.X * tileSize, pta.Y * tileSize})
+	}
+
+	scales := []float64{}
+
+	for _, pcs := range possibleCanvasSizes {
+		scaleHeight := float64(pcs.Y) / float64(imageSize.Y)
+		scaleWidth := float64(pcs.X) / float64(imageSize.X)
+
+		if scaleWidth > scaleHeight {
+			scales = append(scales, scaleHeight)
+		} else {
+			scales = append(scales, scaleWidth)
+		}
+	}
+
+	var minUpscale float64
+	var maxDownscale float64
+	var upscale bool
+
+	for _, s := range scales {
+		if s > 1.0 {
+			upscale = true
+			if minUpscale == 0 {
+				minUpscale = s
+			} else {
+				minUpscale = math.Min(minUpscale, s)
+			}
+		} else {
+			maxDownscale = math.Max(maxDownscale, s)
+		}
+	}
+
+	selectedScale := maxDownscale
+	if upscale {
+		selectedScale = minUpscale
+	}
+
+	var selectedCanvas image.Point
+	for n, pcs := range possibleCanvasSizes {
+		if scales[n] == selectedScale {
+			// choose the smallest possible canvas
+			if selectedCanvas.X == 0 && selectedCanvas.Y == 0 {
+				selectedCanvas = pcs
+			} else if pcs.X*pcs.Y < selectedCanvas.X*selectedCanvas.Y {
+				selectedCanvas = pcs
+			}
+		}
+	}
+	return selectedCanvas
+}
+
+func (p *ImageProcessor) splitToTiles(img image.Image, numTilesSize image.Point) []image.Image {
+	b := img.Bounds()
+	width := b.Max.X - b.Min.X
+	height := b.Max.Y - b.Min.Y
+	tileHeight := height / numTilesSize.Y
+	tileWidth := width / numTilesSize.X
+
+	images := []image.Image{}
+
+	for h := range numTilesSize.Y {
+		for w := range numTilesSize.X {
+			rect := image.Rect(tileWidth*w, tileHeight*h, tileWidth*(w+1), tileHeight*(h+1))
+			images = append(images, img.(interface {
+				SubImage(image.Rectangle) image.Image
+			}).SubImage(rect))
+		}
+	}
+
+	return images
+}
+
+// remove the "alpha" channel by drawing over a prefilled image
+//
+//nolint:unused
+func (p *ImageProcessor) compositeImage(img image.Image) image.Image {
+	dst := image.NewRGBA(img.Bounds())
+
+	white := color.RGBA{255, 255, 255, 255}
+	draw.Draw(dst, dst.Bounds(), &image.Uniform{white}, image.Point{}, draw.Src)
+	draw.Draw(dst, dst.Bounds(), img, img.Bounds().Min, draw.Over)
+
+	return dst
+}
+
+func (p *ImageProcessor) resize(img image.Image, outputSize image.Point, maxImageTiles int) (image.Image, image.Point) {
+	b := img.Bounds()
+	tileSize := outputSize.Y
+
+	canvasSize := p.optimalTiledCanvas(b.Max, maxImageTiles, tileSize)
+	aspectRatio := image.Point{canvasSize.X / tileSize, canvasSize.Y / tileSize}
+	newSize := p.fitToCanvas(b.Max, canvasSize, tileSize)
+
+	dst := image.NewRGBA(image.Rect(0, 0, newSize.X, newSize.Y))
+
+	// scaling choices:
+	//   NearestNeighbor	fast, blocky output
+	//   ApproxBiLinear	fast, medium quality
+	//   BiLinear		slow, high quality
+	//   CatmullRom		very slow, very high quality
+	draw.BiLinear.Scale(dst, dst.Rect, img, b, draw.Over, nil)
+
+	return dst, aspectRatio
+}
+
+func (p *ImageProcessor) pad(img image.Image, outputSize, aspectRatio image.Point) image.Image {
+	paddedSize := image.Point{
+		X: outputSize.X * aspectRatio.X,
+		Y: outputSize.Y * aspectRatio.Y,
+	}
+
+	dst := image.NewRGBA(image.Rect(0, 0, paddedSize.X, paddedSize.Y))
+	draw.Draw(dst, img.Bounds(), img, image.Point{0, 0}, draw.Over)
+
+	return dst
+}
+
+func (p *ImageProcessor) pack(img image.Image, aspectRatio image.Point, mean, std [3]float32) []float32 {
+	subImages := p.splitToTiles(img, aspectRatio)
+
+	var pixelVals []float32
+
+	for _, subImg := range subImages {
+		bounds := subImg.Bounds()
+		var rVals, gVals, bVals []float32
+		for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
+			for x := bounds.Min.X; x < bounds.Max.X; x++ {
+				c := subImg.At(x, y)
+				r, g, b, _ := c.RGBA()
+				rVal := float32(r>>8) / 255.0
+				gVal := float32(g>>8) / 255.0
+				bVal := float32(b>>8) / 255.0
+
+				rVal = (rVal - mean[0]) / std[0]
+				gVal = (gVal - mean[1]) / std[1]
+				bVal = (bVal - mean[2]) / std[2]
+
+				rVals = append(rVals, rVal)
+				gVals = append(gVals, gVal)
+				bVals = append(bVals, bVal)
+			}
+		}
+		pixelVals = append(pixelVals, rVals...)
+		pixelVals = append(pixelVals, gVals...)
+		pixelVals = append(pixelVals, bVals...)
+	}
+
+	return pixelVals
+}
+
+func (p ImageProcessor) ProcessImage(img image.Image) ([]float32, int, error) {
+	outputSize := image.Point{p.imageSize, p.imageSize}
+
+	// CLIP image normalization constants (mean and standard deviation per channel)
+	mean := [3]float32{0.48145466, 0.4578275, 0.40821073}
+	std := [3]float32{0.26862954, 0.26130258, 0.27577711}
+
+	newImage, aspectRatio := p.resize(img, outputSize, p.maxNumTiles)
+	newImage = p.pad(newImage, outputSize, aspectRatio)
+
+	data := p.pack(newImage, aspectRatio, mean, std)
+	aspectRatioIndex := slices.Index(p.supportedAspectRatios(p.maxNumTiles), aspectRatio) + 1
+	return data, aspectRatioIndex, nil
+}
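
A worked example of the aspect-ratio bookkeeping: with maxNumTiles = 4, supportedAspectRatios returns eight arrangements in the order {1,1} {1,2} {1,3} {1,4} {2,1} {2,2} {3,1} {4,1}, and ProcessImage reports the 1-based index of the arrangement actually used. A wide landscape photo whose optimal canvas is two tiles across and one tile high therefore resizes onto a 2×1 grid and gets aspectRatioIndex 5, which the model consumes as the aspect-ratio embedding ID.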

+ 25 - 0
model/mllama/process_text.go

@@ -0,0 +1,25 @@
+package mllama
+
+import (
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+)
+
+type TextProcessor struct {
+	model.BytePairEncoding
+}
+
+func newTextProcessor(c ml.Config) TextProcessor {
+	return TextProcessor{
+		BytePairEncoding: model.BytePairEncoding{
+			Pretokenizer: c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			Vocabulary: &model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Uints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				BOS:    c.Uint("tokenizer.ggml.bos_token_id"),
+				EOS:    c.Uint("tokenizer.ggml.eos_token_id"),
+			},
+		},
+	}
+}

+ 87 - 0
model/mllama/process_text_test.go

@@ -0,0 +1,87 @@
+package mllama
+
+import (
+	"encoding/json"
+	"errors"
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+
+	"github.com/ollama/ollama/model"
+)
+
+func TestProcessText(t *testing.T) {
+	ours, err := model.New(filepath.Join("testdata", "model.bin"))
+	if errors.Is(err, os.ErrNotExist) {
+		t.Skip("no model.bin")
+	} else if err != nil {
+		t.Fatal(err)
+	}
+
+	t.Run("decode", func(t *testing.T) {
+		f, err := os.Open(filepath.Join("testdata", "theirs.json"))
+		if errors.Is(err, os.ErrNotExist) {
+			t.Skip("no theirs.json")
+		} else if err != nil {
+			t.Fatal(err)
+		}
+		defer f.Close()
+
+		var theirs [][]byte
+		if err := json.NewDecoder(f).Decode(&theirs); err != nil {
+			t.Fatal(err)
+		}
+
+		for id := range theirs {
+			ids := []int32{int32(id)}
+			s, err := ours.(model.TextProcessor).Decode(ids)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if diff := cmp.Diff(string(theirs[id]), s); diff != "" {
+				t.Errorf("%d no match (-theirs +ours):\n%s", id, diff)
+			}
+		}
+	})
+
+	t.Run("encode", func(t *testing.T) {
+		f, err := os.Open(filepath.Join("..", "testdata", "inputs.json"))
+		if errors.Is(err, os.ErrNotExist) {
+			t.Skip("no inputs.json")
+		} else if err != nil {
+			t.Fatal(err)
+		}
+		defer f.Close()
+
+		var inputs []struct {
+			Values []byte  `json:"base64"`
+			IDs    []int32 `json:"ids"`
+		}
+
+		if err := json.NewDecoder(f).Decode(&inputs); err != nil {
+			t.Fatal(err)
+		}
+
+		for i, input := range inputs {
+			if i == 45 {
+				t.Skip("skip 45")
+			}
+
+			t.Run(strconv.Itoa(i), func(t *testing.T) {
+				ids, err := ours.(model.TextProcessor).Encode(string(input.Values))
+				if err != nil {
+					t.Fatal(err)
+				}
+
+				if diff := cmp.Diff(input.IDs, ids, cmpopts.EquateEmpty()); diff != "" {
+					t.Errorf("%s: no match (-theirs +ours):\n%s", input.Values, diff)
+				}
+			})
+		}
+	})
+}

+ 1 - 0
model/mllama/testdata/model.bin

@@ -0,0 +1 @@
+/Users/michaelyang/git/ollama/library/nltpt/Llama-3.2-11B-Vision-Instruct/merged.gguf

Diff file is too large
+ 0 - 0
model/mllama/testdata/theirs.json


+ 279 - 0
model/model.go

@@ -0,0 +1,279 @@
+package model
+
+import (
+	"fmt"
+	"image"
+	_ "image/jpeg"
+	_ "image/png"
+	"log/slog"
+	"os"
+	"reflect"
+	"strconv"
+	"strings"
+
+	_ "golang.org/x/image/bmp"
+	_ "golang.org/x/image/tiff"
+	_ "golang.org/x/image/webp"
+
+	"github.com/ollama/ollama/cache"
+	"github.com/ollama/ollama/ml"
+	_ "github.com/ollama/ollama/ml/backend"
+)
+
+type Cache struct {
+	cache.Cache
+	cache.Options
+}
+
+func (c Cache) Sub(i int) Cache {
+	if c.Cache != nil {
+		return Cache{
+			Cache:   c.Cache.Sub(i),
+			Options: c.Options,
+		}
+	}
+
+	return c
+}
+
+func (c Cache) Put(ctx ml.Context, key, value ml.Tensor, opts cache.Options) (ml.Tensor, ml.Tensor) {
+	if c.Cache != nil {
+		return c.Cache.Put(ctx, key, value, opts)
+	}
+
+	return key, value
+}
+
+type Options struct {
+	inputs []int32
+
+	Offset int
+
+	Images []image.Image
+
+	Cache
+}
+
+func (opts Options) Inputs() []int32 {
+	return opts.inputs[opts.Offset:]
+}
+
+func (opts Options) Positions() []int32 {
+	positions := make([]int32, len(opts.inputs)-opts.Offset)
+	for i := range positions {
+		positions[i] = int32(opts.Offset + i)
+	}
+
+	return positions
+}
+
+type OptionsFunc func(Model, *Options)
+
+func WithInputIDs(ids []int32) OptionsFunc {
+	return func(m Model, opts *Options) {
+		opts.inputs = ids
+	}
+}
+
+func WithOffset(offset int) OptionsFunc {
+	return func(m Model, opts *Options) {
+		opts.Offset = offset
+		opts.Cache.Position = offset
+	}
+}
+
+func WithImage(img image.Image) OptionsFunc {
+	return func(m Model, opts *Options) {
+		opts.Images = append(opts.Images, img)
+	}
+}
+
+func WithCache(c cache.Cache) OptionsFunc {
+	return func(m Model, opts *Options) {
+		opts.Cache = Cache{
+			Cache: c,
+			Options: cache.Options{
+				Position: opts.Offset,
+			},
+		}
+	}
+}
+
+type Base struct {
+	b ml.Backend
+}
+
+func (m *Base) Backend() ml.Backend {
+	return m.b
+}
+
+type Model interface {
+	Forward(ml.Context, Options) (ml.Tensor, error)
+
+	Backend() ml.Backend
+}
+
+var models = make(map[string]func(ml.Config) (Model, error))
+
+func Register(name string, f func(ml.Config) (Model, error)) {
+	if _, ok := models[name]; ok {
+		panic("model: model already registered")
+	}
+
+	models[name] = f
+}
+
+func New(s string) (Model, error) {
+	r, err := os.Open(s)
+	if err != nil {
+		return nil, err
+	}
+	defer r.Close()
+
+	b, err := ml.NewBackend(r)
+	if err != nil {
+		return nil, err
+	}
+
+	arch := b.Config().Architecture()
+	f, ok := models[arch]
+	if !ok {
+		return nil, fmt.Errorf("unsupported model architecture %q", arch)
+	}
+
+	m, err := f(b.Config())
+	if err != nil {
+		return nil, err
+	}
+
+	v := reflect.ValueOf(m)
+	v.Elem().Set(populateFields(b, v))
+	return m, nil
+}
+
+func populateFields(b ml.Backend, v reflect.Value, tags ...Tag) reflect.Value {
+	t := v.Type()
+	if t.Kind() == reflect.Pointer {
+		t, v = t.Elem(), v.Elem()
+	}
+
+	if t.Kind() == reflect.Struct {
+		allNil := true
+		for i := range t.NumField() {
+			tt := t.Field(i).Type
+			vv := v.Field(i)
+			if !vv.CanSet() {
+				continue
+			}
+
+			// make a copy
+			tagsCopy := tags
+			if tag := t.Field(i).Tag.Get("gguf"); tag != "" {
+				tagsCopy = append(tagsCopy, ParseTags(tag))
+			}
+
+			if tt == reflect.TypeOf((*Base)(nil)).Elem() {
+				vv.Set(reflect.ValueOf(Base{b: b}))
+			} else if tt == reflect.TypeOf((*ml.Tensor)(nil)).Elem() {
+				var fn func([]Tag) [][]string
+				fn = func(tags []Tag) (values [][]string) {
+					if len(tags) < 1 {
+						return nil
+					}
+
+					values = [][]string{{tags[0].Name}}
+					for _, alt := range tags[0].Alternate {
+						values = append(values, []string{alt})
+					}
+
+					for i, value := range values {
+						for _, rest := range fn(tags[1:]) {
+							value = append(value, rest...)
+						}
+
+						values[i] = value
+					}
+
+					return values
+				}
+
+				names := fn(tagsCopy)
+				for _, name := range names {
+					if tensor := b.Get(strings.Join(name, ".")); tensor != nil {
+						slog.Debug("found tensor", "", tensor)
+						vv.Set(reflect.ValueOf(tensor))
+						break
+					}
+				}
+			} else if tt.Kind() == reflect.Pointer {
+				vvv := vv.Elem()
+				if vv.IsNil() {
+					vvv = reflect.New(tt.Elem())
+				}
+
+				if f := populateFields(b, vvv, tagsCopy...); f.CanAddr() {
+					vv.Set(f.Addr())
+				}
+			} else if tt.Kind() == reflect.Slice || tt.Kind() == reflect.Array {
+				for i := range vv.Len() {
+					vv.Index(i).Set(populateFields(b, vv.Index(i), append(tagsCopy, Tag{Name: strconv.Itoa(i)})...))
+				}
+			}
+
+			if !canNil(tt) || !vv.IsNil() {
+				allNil = false
+			}
+		}
+
+		if allNil {
+			return reflect.Zero(t)
+		}
+	}
+
+	return v
+}
+
+type Tag struct {
+	Name      string
+	Alternate []string
+}
+
+func ParseTags(s string) (tag Tag) {
+	parts := strings.Split(s, ",")
+	if len(parts) > 0 {
+		tag.Name = parts[0]
+
+		for _, part := range parts[1:] {
+			if value, ok := strings.CutPrefix(part, "alt:"); ok {
+				tag.Alternate = append(tag.Alternate, value)
+			}
+		}
+	}
+
+	return
+}
+
+func canNil(t reflect.Type) bool {
+	return t.Kind() == reflect.Chan ||
+		t.Kind() == reflect.Func ||
+		t.Kind() == reflect.Interface ||
+		t.Kind() == reflect.Map ||
+		t.Kind() == reflect.Pointer ||
+		t.Kind() == reflect.Slice
+}
+
+func Forward(m Model, optsFuncs ...OptionsFunc) (ml.Tensor, error) {
+	var opts Options
+	for _, optsFunc := range optsFuncs {
+		optsFunc(m, &opts)
+	}
+
+	ctx := m.Backend().NewContext()
+	t, err := m.Forward(ctx, opts)
+	if err != nil {
+		return nil, err
+	}
+	defer ctx.Close()
+
+	return ctx.Compute(t), nil
+}
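
populateFields resolves tensor names by walking each field's gguf tags along the struct path, joining them with "." and inserting slice/array indices, and trying each alt: fallback in order; the first name that Backend.Get can find wins. An illustrative struct (field names hypothetical, not from this commit):

package example

import "github.com/ollama/ollama/ml/nn"

type layer struct {
	Query *nn.Linear `gguf:"attn_q"`
}

type tiny struct {
	Layers [2]layer   `gguf:"blk"`
	Output *nn.Linear `gguf:"output,alt:token_embd"`
}

// Layers[0].Query.Weight resolves to the tensor named "blk.0.attn_q.weight";
// Output.Weight tries "output.weight" first, then "token_embd.weight".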

+ 136 - 0
model/model_test.go

@@ -0,0 +1,136 @@
+package model
+
+import (
+	"reflect"
+	"slices"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/backend/ggml"
+	"github.com/ollama/ollama/ml/nn"
+)
+
+func TestParseTags(t *testing.T) {
+	cases := []struct {
+		value string
+		want  Tag
+	}{
+		{
+			value: "output",
+			want: Tag{
+				Name: "output",
+			},
+		},
+		{
+			value: "output,alt:token_embd",
+			want: Tag{
+				Name: "output",
+				Alternate: []string{
+					"token_embd",
+				},
+			},
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.value, func(t *testing.T) {
+			got := ParseTags(tt.value)
+			if diff := cmp.Diff(tt.want, got); diff != "" {
+				t.Errorf("ParseTags() returned unexpected values (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+type fakeBackend struct {
+	*ggml.Backend
+	names []string
+}
+
+type fakeTensor struct {
+	*ggml.Tensor
+	Name string
+}
+
+func (m *fakeBackend) Get(name string) ml.Tensor {
+	if slices.Contains(m.names, name) {
+		return &fakeTensor{Name: name}
+	}
+
+	return nil
+}
+
+func TestPopulateFields(t *testing.T) {
+	type fakeLayer struct {
+		Query  *nn.Linear `gguf:"attn_q"`
+		Key    *nn.Linear `gguf:"attn_k"`
+		Value  *nn.Linear `gguf:"attn_v"`
+		Output *nn.Linear `gguf:"attn_o"`
+	}
+
+	type fakeModel struct {
+		Input      *nn.Embedding `gguf:"input"`
+		OutputNorm *nn.RMSNorm   `gguf:"output_norm"`
+		Output     *nn.Linear    `gguf:"output"`
+		Layers     [2]fakeLayer  `gguf:"blk"`
+	}
+
+	var m fakeModel
+	v := reflect.ValueOf(&m)
+	v.Elem().Set(populateFields(&fakeBackend{
+		names: []string{
+			"input.weight",
+			"blk.0.attn_q.weight",
+			"blk.0.attn_k.weight",
+			"blk.0.attn_v.weight",
+			"blk.1.attn_q.weight",
+			"blk.1.attn_k.weight",
+			"blk.1.attn_v.weight",
+			"output_norm.weight",
+			"output.weight",
+		},
+	}, v))
+
+	if diff := cmp.Diff(fakeModel{
+		Input:      &nn.Embedding{Weight: &fakeTensor{Name: "input.weight"}},
+		OutputNorm: &nn.RMSNorm{Weight: &fakeTensor{Name: "output_norm.weight"}},
+		Output:     &nn.Linear{Weight: &fakeTensor{Name: "output.weight"}},
+		Layers: [2]fakeLayer{
+			{
+				Query: &nn.Linear{Weight: &fakeTensor{Name: "blk.0.attn_q.weight"}},
+				Key:   &nn.Linear{Weight: &fakeTensor{Name: "blk.0.attn_k.weight"}},
+				Value: &nn.Linear{Weight: &fakeTensor{Name: "blk.0.attn_v.weight"}},
+			},
+			{
+				Query: &nn.Linear{Weight: &fakeTensor{Name: "blk.1.attn_q.weight"}},
+				Key:   &nn.Linear{Weight: &fakeTensor{Name: "blk.1.attn_k.weight"}},
+				Value: &nn.Linear{Weight: &fakeTensor{Name: "blk.1.attn_v.weight"}},
+			},
+		},
+	}, m); diff != "" {
+		t.Errorf("populateFields() set incorrect values (-want +got):\n%s", diff)
+	}
+}
+
+func TestPopulateFieldsAlternateName(t *testing.T) {
+	type fakeModel struct {
+		Input  *nn.Embedding `gguf:"input"`
+		Output *nn.Linear    `gguf:"output,alt:input"`
+	}
+
+	m := fakeModel{}
+	v := reflect.ValueOf(&m)
+	v.Elem().Set(populateFields(&fakeBackend{
+		names: []string{
+			"input.weight",
+		},
+	}, v))
+
+	if diff := cmp.Diff(fakeModel{
+		Input:  &nn.Embedding{Weight: &fakeTensor{Name: "input.weight"}},
+		Output: &nn.Linear{Weight: &fakeTensor{Name: "input.weight"}},
+	}, m); diff != "" {
+		t.Errorf("populateFields() set incorrect values (-want +got):\n%s", diff)
+	}
+}

+ 312 - 0
model/process_text.go

@@ -0,0 +1,312 @@
+package model
+
+import (
+	"cmp"
+	"log/slog"
+	"strings"
+	"sync"
+
+	"github.com/dlclark/regexp2"
+	heap "github.com/emirpasic/gods/v2/trees/binaryheap"
+)
+
+type Special int32
+
+const (
+	SpecialBOS Special = iota
+	SpecialEOS
+)
+
+type TextProcessor interface {
+	Encode(string) ([]int32, error)
+	Decode([]int32) (string, error)
+	Is(uint32, Special) bool
+}
+
+type Vocabulary struct {
+	Values []string
+	Types  []uint32
+	Scores []uint32
+	Merges []string
+
+	BOS, EOS uint32
+
+	specialOnce sync.Once
+	special     []string
+
+	valuesOnce sync.Once
+	values     map[string]int32
+
+	mergeOnce sync.Once
+	merge     map[string]int32
+}
+
+func (v *Vocabulary) Is(id uint32, special Special) bool {
+	switch special {
+	case SpecialBOS:
+		return id == v.BOS
+	case SpecialEOS:
+		return id == v.EOS
+	default:
+		return false
+	}
+}
+
+func (v *Vocabulary) Encode(s string) int32 {
+	v.valuesOnce.Do(func() {
+		v.values = make(map[string]int32, len(v.Values))
+		for i, value := range v.Values {
+			v.values[value] = int32(i)
+		}
+	})
+
+	if id, ok := v.values[s]; ok {
+		return id
+	}
+
+	return -1
+}
+
+func (v *Vocabulary) Decode(id int32) string {
+	return v.Values[id]
+}
+
+func (v *Vocabulary) SpecialVocabulary() []string {
+	v.specialOnce.Do(func() {
+		for i := range v.Values {
+			if v.Types[i] == 3 {
+				v.special = append(v.special, v.Values[i])
+			}
+		}
+	})
+
+	return v.special
+}
+
+func (v *Vocabulary) Merge(left, right string) int {
+	v.mergeOnce.Do(func() {
+		v.merge = make(map[string]int32, len(v.Merges))
+		for i, merge := range v.Merges {
+			v.merge[merge] = int32(i)
+		}
+	})
+
+	if id, ok := v.merge[left+" "+right]; ok {
+		return int(id)
+	}
+
+	return -1
+}
+
+type BytePairEncoding struct {
+	Pretokenizer string
+
+	*Vocabulary
+}
+
+func (bpe BytePairEncoding) split(s string) ([]string, error) {
+	re, err := regexp2.Compile(bpe.Pretokenizer, regexp2.Unicode|regexp2.RE2)
+	if err != nil {
+		return nil, err
+	}
+
+	var matches []string
+	for m, _ := re.FindStringMatch(s); m != nil; m, _ = re.FindNextMatch(m) {
+		matches = append(matches, m.String())
+	}
+
+	return matches, nil
+}
+
+// fragment is a string fragment and its corresponding token IDs
+type fragment struct {
+	value string
+	ids   []int32
+}
+
+// pair is a pair of runes and its rank
+type pair struct {
+	a, b  int
+	rank  int
+	value string
+}
+
+type merge struct {
+	p, n  int
+	runes []rune
+}
+
+func (bpe BytePairEncoding) Encode(s string) ([]int32, error) {
+	fragments := []fragment{{value: s}}
+	for _, special := range bpe.Vocabulary.SpecialVocabulary() {
+		// TODO: process special tokens concurrently
+		id := bpe.Vocabulary.Encode(special)
+		for i := 0; i < len(fragments); i++ {
+			frag := fragments[i]
+			if len(frag.ids) > 0 {
+				continue
+			}
+
+			var middle []fragment
+			switch i := strings.Index(frag.value, special); {
+			case i < 0:
+				middle = append(middle, frag)
+			case i > 0:
+				middle = append(middle, fragment{value: frag.value[:i]})
+				fallthrough
+			default:
+				middle = append(middle, fragment{value: special, ids: []int32{id}})
+				if rest := frag.value[i+len(special):]; rest != "" {
+					middle = append(middle, fragment{value: rest})
+				}
+			}
+
+			fragments = append(fragments[:i], append(middle, fragments[i+1:]...)...)
+		}
+	}
+
+	var ids []int32
+	for _, frag := range fragments {
+		if len(frag.ids) > 0 {
+			ids = append(ids, frag.ids...)
+			slog.Debug("encoded", "text", frag.value, "ids", frag.ids, "special", true)
+			continue
+		}
+
+		// split fragment using pretokenizer
+		splits, err := bpe.split(frag.value)
+		if err != nil {
+			return nil, err
+		}
+
+		for _, split := range splits {
+			// TODO: process splits concurrently
+			var sb strings.Builder
+			for _, b := range []byte(split) {
+				r := rune(b)
+				switch {
+				case r == 0x00ad:
+					r = 0x0143
+				case r <= 0x0020:
+					r = r + 0x0100
+				case r >= 0x007e && r <= 0x00a0:
+					r = r + 0x00a2
+				}
+
+				sb.WriteRune(r)
+			}
+
+			// short circuit if the fragment is in the vocabulary
+			if id := bpe.Vocabulary.Encode(sb.String()); id >= 0 {
+				ids = append(ids, id)
+				slog.Debug("encoded", "text", sb.String(), "ids", []int32{id})
+				continue
+			}
+
+			runes := []rune(sb.String())
+			merges := make([]merge, len(runes))
+			for r := range runes {
+				merges[r] = merge{
+					p:     r - 1,
+					n:     r + 1,
+					runes: []rune{runes[r]},
+				}
+			}
+
+			pairwise := func(a, b int) *pair {
+				if a < 0 || b >= len(runes) {
+					return nil
+				}
+
+				left, right := string(merges[a].runes), string(merges[b].runes)
+				rank := bpe.Vocabulary.Merge(left, right)
+				if rank < 0 {
+					return nil
+				}
+
+				return &pair{
+					a:     a,
+					b:     b,
+					rank:  rank,
+					value: left + right,
+				}
+			}
+
+			pairs := heap.NewWith(func(i, j *pair) int {
+				return cmp.Compare(i.rank, j.rank)
+			})
+
+			for i := range len(runes) - 1 {
+				if pair := pairwise(i, i+1); pair != nil {
+					pairs.Push(pair)
+				}
+			}
+
+			for !pairs.Empty() {
+				pair, _ := pairs.Pop()
+
+				left, right := merges[pair.a], merges[pair.b]
+				if len(left.runes) == 0 || len(right.runes) == 0 ||
+					string(left.runes)+string(right.runes) != pair.value {
+					continue
+				}
+
+				merges[pair.a].runes = append(left.runes, right.runes...)
+				merges[pair.b].runes = nil
+
+				merges[pair.a].n = right.n
+				if right.n < len(merges) {
+					merges[right.n].p = pair.a
+				}
+
+				if pair := pairwise(merges[pair.a].p, pair.a); pair != nil {
+					pairs.Push(pair)
+				}
+
+				if pair := pairwise(pair.a, merges[pair.a].n); pair != nil {
+					pairs.Push(pair)
+				}
+			}
+
+			for _, merge := range merges {
+				if len(merge.runes) > 0 {
+					// TODO: handle the edge case where the rune isn't in the vocabulary
+					if id := bpe.Vocabulary.Encode(string(merge.runes)); id >= 0 {
+						ids = append(ids, id)
+						slog.Debug("encoded", "text", string(merge.runes), "ids", []int32{id})
+					}
+				}
+			}
+		}
+	}
+
+	return ids, nil
+}
+
+func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
+	var sb strings.Builder
+	for _, id := range ids {
+		for _, r := range bpe.Vocabulary.Decode(id) {
+			switch {
+			case r == 0x0100:
+				// this produces 0x00 aka NULL
+				continue
+			case r == 0x0143:
+				r = 0x00ad
+			case r > 0x0100 && r <= 0x0120:
+				r = r - 0x0100
+			case r > 0x0120 && r <= 0x0142:
+				r = r - 0x00a2
+			}
+
+			// NOTE: not using WriteRune here because it writes the UTF-8
+			// encoding of the rune which is _not_ what we want
+			if err := sb.WriteByte(byte(r)); err != nil {
+				return "", err
+			}
+		}
+	}
+
+	slog.Debug("decoded", "ids", ids, "text", sb.String())
+	return sb.String(), nil
+}
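
Encode and Decode form a reversible byte-level BPE pipeline: Encode splits out special tokens, pretokenizes with the configured regex, remaps raw bytes into printable runes, then greedily applies the highest-ranked merges; Decode reverses the byte remapping. A rough usage sketch follows; the pretokenizer pattern and the toy vocabulary are illustrative assumptions, not real model data, which normally comes from GGUF metadata.

	tok := BytePairEncoding{
		Pretokenizer: `\S+|\s+`, // illustrative pattern, not the real GPT-style regex
		Vocabulary: &Vocabulary{
			Values: []string{"Hello", "Ġworld"}, // toy entries
			Types:  []uint32{1, 1},
			Merges: []string{"Hel lo"}, // toy merge table
		},
	}

	ids, err := tok.Encode("Hello world")
	if err != nil {
		// handle error
	}
	text, _ := tok.Decode(ids)
	_ = text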

+ 586 - 0
model/testdata/inputs.json

@@ -0,0 +1,586 @@
+[
+    {
+        "base64": "aWVkIDQgwr0gbW9udGhz",
+        "ids": [
+            1142,
+            220,
+            19,
+            220,
+            27154,
+            4038
+        ]
+    },
+    {
+        "base64": "RsO8aHJlcg==",
+        "ids": [
+            37,
+            51853,
+            261
+        ]
+    },
+    {
+        "base64": "",
+        "ids": []
+    },
+    {
+        "base64": "IA==",
+        "ids": [
+            220
+        ]
+    },
+    {
+        "base64": "ICA=",
+        "ids": [
+            256
+        ]
+    },
+    {
+        "base64": "ICAg",
+        "ids": [
+            262
+        ]
+    },
+    {
+        "base64": "CQ==",
+        "ids": [
+            197
+        ]
+    },
+    {
+        "base64": "Cg==",
+        "ids": [
+            198
+        ]
+    },
+    {
+        "base64": "Cgo=",
+        "ids": [
+            271
+        ]
+    },
+    {
+        "base64": "CgoK",
+        "ids": [
+            1432
+        ]
+    },
+    {
+        "base64": "CQo=",
+        "ids": [
+            1602
+        ]
+    },
+    {
+        "base64": "SGVsbG8gd29ybGQ=",
+        "ids": [
+            9906,
+            1917
+        ]
+    },
+    {
+        "base64": "IEhlbGxvIHdvcmxk",
+        "ids": [
+            22691,
+            1917
+        ]
+    },
+    {
+        "base64": "SGVsbG8gV29ybGQ=",
+        "ids": [
+            9906,
+            4435
+        ]
+    },
+    {
+        "base64": "IEhlbGxvIFdvcmxk",
+        "ids": [
+            22691,
+            4435
+        ]
+    },
+    {
+        "base64": "IEhlbGxvIFdvcmxkIQ==",
+        "ids": [
+            22691,
+            4435,
+            0
+        ]
+    },
+    {
+        "base64": "SGVsbG8sIHdvcmxkIQ==",
+        "ids": [
+            9906,
+            11,
+            1917,
+            0
+        ]
+    },
+    {
+        "base64": "IEhlbGxvLCB3b3JsZCE=",
+        "ids": [
+            22691,
+            11,
+            1917,
+            0
+        ]
+    },
+    {
+        "base64": "IHRoaXMgaXMg8J+mmS5jcHA=",
+        "ids": [
+            420,
+            374,
+            11410,
+            99,
+            247,
+            13,
+            11055
+        ]
+    },
+    {
+        "base64": "dzA0OCA3dHVpamsgZHNkZmh1",
+        "ids": [
+            86,
+            23904,
+            220,
+            22,
+            83,
+            2005,
+            42908,
+            11729,
+            3013,
+            17156
+        ]
+    },
+    {
+        "base64": "0L3QtdGJ0L4g0L3QsCDQkdGK0LvQs9Cw0YDRgdC60Lg=",
+        "ids": [
+            79862,
+            102118,
+            13373,
+            64571,
+            34694,
+            3114,
+            112203,
+            80112
+        ]
+    },
+    {
+        "base64": "4Z6A4Z624Z6T4Z+L4Z6P4Z+C4Z6W4Z634Z6f4Z+B4Z6f4Z6i4Z624Z6F4Z6B4Z6b4Z6F4Z+B4Z6J",
+        "ids": [
+            21549,
+            222,
+            98629,
+            241,
+            45358,
+            233,
+            21549,
+            237,
+            45358,
+            224,
+            21549,
+            244,
+            21549,
+            115,
+            21549,
+            253,
+            45358,
+            223,
+            21549,
+            253,
+            21549,
+            95,
+            98629,
+            227,
+            21549,
+            223,
+            21549,
+            249,
+            21549,
+            227,
+            45358,
+            223,
+            21549,
+            231
+        ]
+    },
+    {
+        "base64": "8J+agCAobm9ybWFsKSDwn5i24oCN8J+Mq++4jyAobXVsdGlwbGUgZW1vamlzIGNvbmNhdGVuYXRlZCkg4pyFIChvbmx5IGVtb2ppIHRoYXQgaGFzIGl0cyBvd24gdG9rZW4p",
+        "ids": [
+            9468,
+            248,
+            222,
+            320,
+            8416,
+            8,
+            27623,
+            114,
+            102470,
+            9468,
+            234,
+            104,
+            31643,
+            320,
+            36773,
+            100166,
+            98634,
+            8,
+            26602,
+            227,
+            320,
+            3323,
+            43465,
+            430,
+            706,
+            1202,
+            1866,
+            4037,
+            8
+        ]
+    },
+    {
+        "base64": "SGVsbG8=",
+        "ids": [
+            9906
+        ]
+    },
+    {
+        "base64": "IEhlbGxv",
+        "ids": [
+            22691
+        ]
+    },
+    {
+        "base64": "ICBIZWxsbw==",
+        "ids": [
+            220,
+            22691
+        ]
+    },
+    {
+        "base64": "ICAgSGVsbG8=",
+        "ids": [
+            256,
+            22691
+        ]
+    },
+    {
+        "base64": "ICAgIEhlbGxv",
+        "ids": [
+            262,
+            22691
+        ]
+    },
+    {
+        "base64": "ICAgIEhlbGxvCiAgICBIZWxsbw==",
+        "ids": [
+            262,
+            22691,
+            198,
+            262,
+            22691
+        ]
+    },
+    {
+        "base64": "ICg=",
+        "ids": [
+            320
+        ]
+    },
+    {
+        "base64": "CiA9",
+        "ids": [
+            198,
+            284
+        ]
+    },
+    {
+        "base64": "JyBlcmE=",
+        "ids": [
+            6,
+            11639
+        ]
+    },
+    {
+        "base64": "SGVsbG8sIHknYWxsISBIb3cgYXJlIHlvdSDwn5iBID/miJHmg7PlnKhhcHBsZeW3peS9nDEzMTQxNTHlpKnvvZ4=",
+        "ids": [
+            9906,
+            11,
+            379,
+            65948,
+            0,
+            2650,
+            527,
+            499,
+            27623,
+            223,
+            949,
+            37046,
+            101067,
+            19000,
+            23182,
+            102301,
+            9263,
+            18136,
+            16,
+            36827,
+            21909
+        ]
+    },
+    {
+        "base64": "ISEhISEh",
+        "ids": [
+            17523,
+            3001
+        ]
+    },
+    {
+        "base64": "Mw==",
+        "ids": [
+            18
+        ]
+    },
+    {
+        "base64": "MzM=",
+        "ids": [
+            1644
+        ]
+    },
+    {
+        "base64": "MzMz",
+        "ids": [
+            8765
+        ]
+    },
+    {
+        "base64": "MzMzMw==",
+        "ids": [
+            8765,
+            18
+        ]
+    },
+    {
+        "base64": "MzMzMzM=",
+        "ids": [
+            8765,
+            1644
+        ]
+    },
+    {
+        "base64": "MzMzMzMz",
+        "ids": [
+            8765,
+            8765
+        ]
+    },
+    {
+        "base64": "MzMzMzMzMw==",
+        "ids": [
+            8765,
+            8765,
+            18
+        ]
+    },
+    {
+        "base64": "MzMzMzMzMzM=",
+        "ids": [
+            8765,
+            8765,
+            1644
+        ]
+    },
+    {
+        "base64": "MzMzMzMzMzMz",
+        "ids": [
+            8765,
+            8765,
+            8765
+        ]
+    },
+    {
+        "base64": "Q+G7rWEgVmnhu4d0",
+        "ids": [
+            34,
+            91163,
+            101798
+        ]
+    },
+    {
+        "base64": "IGRpc2NhcmRz",
+        "ids": [
+            2624,
+            2402
+        ]
+    },
+    {
+        "base64": "CiAKCiAKCgogCSAJCSAJCiAgCiAgIAogICAgCiAgICAgCvCfmoAgKG5vcm1hbCkg8J+YtuKAjfCfjKvvuI8gKG11bHRpcGxlIGVtb2ppcyBjb25jYXRlbmF0ZWQpIOKchSDwn6aZ8J+mmSAzIDMzIDMzMyAzMzMzIDMzMzMzIDMzMzMzMyAzMzMzMzMzIDMzMzMzMzMzIDMuMyAzLi4zIDMuLi4zIOGegOGetuGek+Gfi+Gej+GfguGeluGet+Gen+GfgeGen+GeouGetuGehfCfmIEgP+aIkeaDs+WcqGFwcGxl5bel5L2cMTMxNDE1MeWkqe+9niAtLS0tLS09PT09PT09INC90LXRidC+INC90LAg0JHRitC70LPQsNGA0YHQutC4ICcnJycnJ2BgYGBgYGAiIiIiLi4uLi4uISEhISEhPz8/Pz8/IEkndmUgYmVlbiAndG9sZCBoZSdzIHRoZXJlLCAnUkUgeW91IHN1cmU/ICdNIG5vdCBzdXJlIEknbGwgbWFrZSBpdCwgJ0QgeW91IGxpa2Ugc29tZSB0ZWE/IFdlJ1ZlIGEnbEw=",
+        "ids": [
+            198,
+            4815,
+            15073,
+            66597,
+            8004,
+            1602,
+            2355,
+            79772,
+            11187,
+            9468,
+            248,
+            222,
+            320,
+            8416,
+            8,
+            27623,
+            114,
+            102470,
+            9468,
+            234,
+            104,
+            31643,
+            320,
+            36773,
+            100166,
+            98634,
+            8,
+            26602,
+            227,
+            11410,
+            99,
+            247,
+            9468,
+            99,
+            247,
+            220,
+            18,
+            220,
+            1644,
+            220,
+            8765,
+            220,
+            8765,
+            18,
+            220,
+            8765,
+            1644,
+            220,
+            8765,
+            8765,
+            220,
+            8765,
+            8765,
+            18,
+            220,
+            8765,
+            8765,
+            1644,
+            220,
+            18,
+            13,
+            18,
+            220,
+            18,
+            497,
+            18,
+            220,
+            18,
+            1131,
+            18,
+            220,
+            21549,
+            222,
+            98629,
+            241,
+            45358,
+            233,
+            21549,
+            237,
+            45358,
+            224,
+            21549,
+            244,
+            21549,
+            115,
+            21549,
+            253,
+            45358,
+            223,
+            21549,
+            253,
+            21549,
+            95,
+            98629,
+            227,
+            76460,
+            223,
+            949,
+            37046,
+            101067,
+            19000,
+            23182,
+            102301,
+            9263,
+            18136,
+            16,
+            36827,
+            21909,
+            56560,
+            54337,
+            19175,
+            102118,
+            13373,
+            64571,
+            34694,
+            3114,
+            112203,
+            80112,
+            3436,
+            106451,
+            14196,
+            14196,
+            74694,
+            3089,
+            3089,
+            29249,
+            17523,
+            3001,
+            27708,
+            7801,
+            358,
+            3077,
+            1027,
+            364,
+            83,
+            820,
+            568,
+            596,
+            1070,
+            11,
+            364,
+            793,
+            499,
+            2771,
+            30,
+            364,
+            44,
+            539,
+            2771,
+            358,
+            3358,
+            1304,
+            433,
+            11,
+            364,
+            35,
+            499,
+            1093,
+            1063,
+            15600,
+            30,
+            1226,
+            6,
+            43712,
+            264,
+            64966,
+            43
+        ]
+    }
+]

+ 3 - 3
parser/parser_test.go

@@ -19,7 +19,7 @@ import (
 	"golang.org/x/text/encoding/unicode"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 func TestParseFileFile(t *testing.T) {
@@ -769,7 +769,7 @@ func getSHA256Digest(t *testing.T, r io.Reader) (string, int64) {
 	return fmt.Sprintf("sha256:%x", h.Sum(nil)), n
 }
 
-func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, string) {
+func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
 	t.Helper()
 
 	f, err := os.CreateTemp(t.TempDir(), "testbin.*.gguf")
@@ -778,7 +778,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, st
 	}
 	defer f.Close()
 
-	if err := llm.WriteGGUF(f, kv, ti); err != nil {
+	if err := ggml.WriteGGUF(f, kv, ti); err != nil {
 		t.Fatal(err)
 	}
 	// Calculate sha256 of file

+ 13 - 0
sample/greedy.go

@@ -0,0 +1,13 @@
+package sample
+
+import "gonum.org/v1/gonum/floats"
+
+type greedy struct{}
+
+func Greedy() Sampler {
+	return greedy{}
+}
+
+func (s greedy) Sample(t []float64) ([]float64, error) {
+	return []float64{float64(floats.MaxIdx(t))}, nil
+}
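
For orientation: Greedy is an argmax sampler. Given a slice of logits it returns the index of the largest value as a one-element slice.

	s := Greedy()
	out, err := s.Sample([]float64{0.1, 2.3, 0.5})
	// err is nil; out is []float64{1}, the index of the largest logit
	_, _ = out, err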

+ 74 - 0
sample/sample.go

@@ -0,0 +1,74 @@
+package sample
+
+import (
+	"slices"
+
+	"gonum.org/v1/gonum/floats"
+	"gonum.org/v1/gonum/stat/sampleuv"
+)
+
+type Sampler interface {
+	Sample([]float64) ([]float64, error)
+}
+
+type Temperature float64
+
+func (s Temperature) Sample(t []float64) ([]float64, error) {
+	floats.Div(t, slices.Repeat([]float64{float64(s)}, len(t)))
+	return t, nil
+}
+
+type softmax struct{}
+
+func Softmax() Sampler {
+	return softmax{}
+}
+
+func (softmax) Sample(t []float64) ([]float64, error) {
+	return t, nil
+}
+
+type TopK int
+
+func (s TopK) Sample(t []float64) ([]float64, error) {
+	return t, nil
+}
+
+type TopP float32
+
+func (s TopP) Sample(t []float64) ([]float64, error) {
+	return t, nil
+}
+
+type MinP float32
+
+func (s MinP) Sample(t []float64) ([]float64, error) {
+	return t, nil
+}
+
+type weighed struct{}
+
+func Weighed() Sampler {
+	return weighed{}
+}
+
+func (s weighed) Sample(t []float64) ([]float64, error) {
+	w := sampleuv.NewWeighted(t, nil)
+	if v, ok := w.Take(); ok {
+		return []float64{float64(v)}, nil
+	}
+
+	return t, nil
+}
+
+func Sample(floats []float64, samplers ...Sampler) ([]float64, error) {
+	var err error
+	for _, sampler := range samplers {
+		floats, err = sampler.Sample(floats)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return floats, nil
+}
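
Sample threads the logits through each sampler from left to right. Note that in this commit Softmax, TopK, TopP and MinP are still pass-through stubs, so only Temperature, Greedy and Weighed affect the output. A minimal chaining sketch, where logits stands for the raw model outputs as a []float64 (an assumption for the example):

	out, err := Sample(logits, Temperature(0.8), TopK(40), Greedy())
	if err != nil {
		// handle error
	}
	// with Greedy as the terminal sampler, out holds a single token index
	_ = out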

+ 14 - 14
server/create.go

@@ -21,8 +21,8 @@ import (
 	"github.com/ollama/ollama/convert"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
@@ -205,7 +205,7 @@ func detectModelTypeFromFiles(files map[string]string) string {
 				return ""
 			}
 
-			ct := llm.DetectGGMLType(buf)
+			ct := ggml.DetectContentType(buf)
 			if ct == "gguf" {
 				return "gguf"
 			}
@@ -271,11 +271,11 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
 		return nil, err
 	}
 
-	ggml, _, err := llm.DecodeGGML(bin, 0)
+	f, _, err := ggml.Decode(bin, 0)
 	if err != nil {
 		return nil, err
 	}
-	layers := []*layerGGML{{layer, ggml}}
+	layers := []*layerGGML{{layer, f}}
 
 	if !isAdapter {
 		return detectChatTemplate(layers)
@@ -283,13 +283,13 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
 	return layers, nil
 }
 
-func kvFromLayers(baseLayers []*layerGGML) (llm.KV, error) {
+func kvFromLayers(baseLayers []*layerGGML) (ggml.KV, error) {
 	for _, l := range baseLayers {
 		if l.GGML != nil {
 			return l.KV(), nil
 		}
 	}
-	return llm.KV{}, fmt.Errorf("no base model was found")
+	return ggml.KV{}, fmt.Errorf("no base model was found")
 }
 
 func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML, fn func(resp api.ProgressResponse)) (err error) {
@@ -306,7 +306,7 @@ func createModel(r api.CreateRequest, name model.Name, baseLayers []*layerGGML,
 		if layer.GGML != nil {
 			quantType := strings.ToUpper(cmp.Or(r.Quantize, r.Quantization))
 			if quantType != "" && layer.GGML.Name() == "gguf" && layer.MediaType == "application/vnd.ollama.image.model" {
-				want, err := llm.ParseFileType(quantType)
+				want, err := ggml.ParseFileType(quantType)
 				if err != nil {
 					return err
 				}
@@ -403,7 +403,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
 	ft := layer.GGML.KV().FileType()
 	fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType)})
 
-	want, err := llm.ParseFileType(quantizeType)
+	want, err := ggml.ParseFileType(quantizeType)
 	if err != nil {
 		return nil, err
 	}
@@ -433,13 +433,13 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
 		return nil, err
 	}
 
-	ggml, _, err := llm.DecodeGGML(temp, 0)
+	f, _, err := ggml.Decode(temp, 0)
 	if err != nil {
 		slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
 		return nil, err
 	}
 
-	return &layerGGML{newLayer, ggml}, nil
+	return &layerGGML{newLayer, f}, nil
 }
 
 func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML, error) {
@@ -475,7 +475,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
 
 	var offset int64
 	for offset < stat.Size() {
-		ggml, n, err := llm.DecodeGGML(blob, 0)
+		f, n, err := ggml.Decode(blob, 0)
 		if errors.Is(err, io.EOF) {
 			break
 		} else if err != nil {
@@ -483,9 +483,9 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
 		}
 
 		mediatype := "application/vnd.ollama.image.model"
-		if ggml.KV().Kind() == "adapter" {
+		if f.KV().Kind() == "adapter" {
 			mediatype = "application/vnd.ollama.image.adapter"
-		} else if _, ok := ggml.KV()[fmt.Sprintf("%s.vision.block_count", ggml.KV().Architecture())]; ok || ggml.KV().Kind() == "projector" {
+		} else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" {
 			mediatype = "application/vnd.ollama.image.projector"
 		}
 
@@ -506,7 +506,7 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
 			}
 		}
 
-		layers = append(layers, &layerGGML{layer, ggml})
+		layers = append(layers, &layerGGML{layer, f})
 		offset = n
 	}
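
The recurring change in these server hunks is the switch from llm.DecodeGGML to the new fs/ggml package. A condensed sketch of the new call shape, assuming r is an open model file as in the hunks above; the second argument is the max metadata array size, mirroring the old DecodeGGML parameter.

	f, n, err := ggml.Decode(r, 0)
	if err != nil {
		// handle error
	}
	arch := f.KV().Architecture()
	kind := f.KV().Kind() // e.g. "adapter" or "projector"
	_ = n                 // used above in ggufLayers to advance the blob offset
	_, _ = arch, kind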
 

+ 5 - 5
server/images.go

@@ -23,7 +23,7 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/model"
@@ -78,21 +78,21 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
 	for _, cap := range caps {
 		switch cap {
 		case CapabilityCompletion:
-			f, err := os.Open(m.ModelPath)
+			r, err := os.Open(m.ModelPath)
 			if err != nil {
 				slog.Error("couldn't open model file", "error", err)
 				continue
 			}
-			defer f.Close()
+			defer r.Close()
 
 			// TODO(mxyng): decode the GGML into model to avoid doing this multiple times
-			ggml, _, err := llm.DecodeGGML(f, 0)
+			f, _, err := ggml.Decode(r, 0)
 			if err != nil {
 				slog.Error("couldn't decode ggml", "error", err)
 				continue
 			}
 
-			if _, ok := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]; ok {
+			if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
 				errs = append(errs, errCapabilityCompletion)
 			}
 		case CapabilityTools:

+ 5 - 5
server/model.go

@@ -15,7 +15,7 @@ import (
 	"text/template/parse"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/model"
 )
@@ -24,7 +24,7 @@ var intermediateBlobs map[string]string = make(map[string]string)
 
 type layerGGML struct {
 	Layer
-	*llm.GGML
+	*ggml.GGML
 }
 
 func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
@@ -64,12 +64,12 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 			}
 			defer blob.Close()
 
-			ggml, _, err := llm.DecodeGGML(blob, 0)
+			f, _, err := ggml.Decode(blob, 0)
 			if err != nil {
 				return nil, err
 			}
 
-			layers = append(layers, &layerGGML{layer, ggml})
+			layers = append(layers, &layerGGML{layer, f})
 		default:
 			layers = append(layers, &layerGGML{layer, nil})
 		}
@@ -118,7 +118,7 @@ func detectContentType(r io.Reader) (string, error) {
 		return "", err
 	}
 
-	if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" {
+	if contentType := ggml.DetectContentType(b.Bytes()); contentType != "" {
 		return contentType, nil
 	}
 

+ 2 - 1
server/routes.go

@@ -30,6 +30,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/model/mllama"
 	"github.com/ollama/ollama/openai"
@@ -860,7 +861,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	return resp, nil
 }
 
-func getKVData(digest string, verbose bool) (llm.KV, error) {
+func getKVData(digest string, verbose bool) (ggml.KV, error) {
 	maxArraySize := 0
 	if verbose {
 		maxArraySize = -1

+ 4 - 4
server/routes_create_test.go

@@ -19,12 +19,12 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 var stream bool = false
 
-func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, string) {
+func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) (string, string) {
 	t.Helper()
 	t.Setenv("OLLAMA_MODELS", cmp.Or(os.Getenv("OLLAMA_MODELS"), t.TempDir()))
 
@@ -36,7 +36,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) (string, st
 	}
 	defer f.Close()
 
-	if err := llm.WriteGGUF(f, kv, ti); err != nil {
+	if err := ggml.WriteGGUF(f, kv, ti); err != nil {
 		t.Fatal(err)
 	}
 	// Calculate sha256 of file
@@ -672,7 +672,7 @@ func TestCreateDetectTemplate(t *testing.T) {
 	var s Server
 
 	t.Run("matched", func(t *testing.T) {
-		_, digest := createBinFile(t, llm.KV{
+		_, digest := createBinFile(t, ggml.KV{
 			"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
 		}, nil)
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{

+ 13 - 12
server/routes_generate_test.go

@@ -16,6 +16,7 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -45,8 +46,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }
 
-func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+	return func(_ discover.GpuInfoList, _ string, _ *ggml.GGML, _, _ []string, _ api.Options, _ int) (llm.LlamaServer, error) {
 		return mock, nil
 	}
 }
@@ -76,7 +77,7 @@ func TestGenerateChat(t *testing.T) {
 			getGpuFn:      discover.GetGPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -88,7 +89,7 @@ func TestGenerateChat(t *testing.T) {
 
 	go s.sched.Run(context.TODO())
 
-	_, digest := createBinFile(t, llm.KV{
+	_, digest := createBinFile(t, ggml.KV{
 		"general.architecture":          "llama",
 		"llama.block_count":             uint32(1),
 		"llama.context_length":          uint32(8192),
@@ -98,7 +99,7 @@ func TestGenerateChat(t *testing.T) {
 		"tokenizer.ggml.tokens":         []string{""},
 		"tokenizer.ggml.scores":         []float32{0},
 		"tokenizer.ggml.token_type":     []int32{0},
-	}, []llm.Tensor{
+	}, []ggml.Tensor{
 		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -154,10 +155,10 @@ func TestGenerateChat(t *testing.T) {
 	})
 
 	t.Run("missing capabilities chat", func(t *testing.T) {
-		_, digest := createBinFile(t, llm.KV{
+		_, digest := createBinFile(t, ggml.KV{
 			"general.architecture": "bert",
 			"bert.pooling_type":    uint32(0),
-		}, []llm.Tensor{})
+		}, []ggml.Tensor{})
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model:  "bert",
 			Files:  map[string]string{"bert.gguf": digest},
@@ -612,7 +613,7 @@ func TestGenerate(t *testing.T) {
 			getGpuFn:      discover.GetGPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -624,7 +625,7 @@ func TestGenerate(t *testing.T) {
 
 	go s.sched.Run(context.TODO())
 
-	_, digest := createBinFile(t, llm.KV{
+	_, digest := createBinFile(t, ggml.KV{
 		"general.architecture":          "llama",
 		"llama.block_count":             uint32(1),
 		"llama.context_length":          uint32(8192),
@@ -634,7 +635,7 @@ func TestGenerate(t *testing.T) {
 		"tokenizer.ggml.tokens":         []string{""},
 		"tokenizer.ggml.scores":         []float32{0},
 		"tokenizer.ggml.token_type":     []int32{0},
-	}, []llm.Tensor{
+	}, []ggml.Tensor{
 		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 		{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 		{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -686,10 +687,10 @@ func TestGenerate(t *testing.T) {
 	})
 
 	t.Run("missing capabilities generate", func(t *testing.T) {
-		_, digest := createBinFile(t, llm.KV{
+		_, digest := createBinFile(t, ggml.KV{
 			"general.architecture": "bert",
 			"bert.pooling_type":    uint32(0),
-		}, []llm.Tensor{})
+		}, []ggml.Tensor{})
 
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model:  "bert",

+ 3 - 3
server/routes_test.go

@@ -21,7 +21,7 @@ import (
 	"unicode"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/version"
@@ -654,8 +654,8 @@ func TestShow(t *testing.T) {
 
 	var s Server
 
-	_, digest1 := createBinFile(t, llm.KV{"general.architecture": "test"}, nil)
-	_, digest2 := createBinFile(t, llm.KV{"general.type": "projector", "general.architecture": "clip"}, nil)
+	_, digest1 := createBinFile(t, ggml.KV{"general.architecture": "test"}, nil)
+	_, digest2 := createBinFile(t, ggml.KV{"general.type": "projector", "general.architecture": "clip"}, nil)
 
 	createRequest(t, s.CreateHandler, api.CreateRequest{
 		Name:  "show-model",

+ 13 - 12
server/sched.go

@@ -18,6 +18,7 @@ import (
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -41,8 +42,8 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex
 
-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
-	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	loadFn       func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn  func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
 	getGpuFn     func() discover.GpuInfoList
 	getCpuFn     func() discover.GpuInfoList
 	reschedDelay time.Duration
@@ -409,7 +410,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
@@ -417,12 +418,12 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoL
 	if req.sessionDuration != nil {
 		sessionDuration = req.sessionDuration.Duration
 	}
-	llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
+	llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 	if err != nil {
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
 		// check for model compatibility
-		if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
+		if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
 			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
 		}
 		slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
@@ -685,7 +686,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64
 
 	var numParallelToTry []int
@@ -710,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
 						return []discover.GpuInfo{g}
@@ -726,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 		// Now try all the GPUs
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
-			if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+			if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
 				*numParallel = p
 				return sgl
@@ -737,7 +738,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
@@ -749,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.
 	var bestEstimate uint64
 	var bestFit int
 	for i, gl := range byLibrary {
-		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+		_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
 		if estimatedVRAM > bestEstimate {
 			bestEstimate = estimatedVRAM
 			bestFit = i
@@ -822,9 +823,9 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
-	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+	estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {
 		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
 		return nil

+ 19 - 18
server/sched_test.go

@@ -15,6 +15,7 @@ import (
 	"github.com/ollama/ollama/app/lifecycle"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -37,7 +38,7 @@ func TestLoad(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
-	var ggml *llm.GGML // value not used in tests
+	var f *ggml.GGML // value not used in tests
 	req := &LlmRequest{
 		ctx:             ctx,
 		model:           &Model{ModelPath: "foo"},
@@ -47,11 +48,11 @@ func TestLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
 	gpus := discover.GpuInfoList{}
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
 	s.loadedMu.Lock()
@@ -61,10 +62,10 @@ func TestLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 	select {
 	case err := <-req.errCh:
 		require.NoError(t, err)
@@ -78,7 +79,7 @@ func TestLoad(t *testing.T) {
 
 	req.model.ModelPath = "dummy_model_path"
 	server.waitResp = errors.New("wait failure")
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 	select {
 	case err := <-req.errCh:
 		require.Contains(t, err.Error(), "wait failure")
@@ -99,10 +100,10 @@ type reqBundle struct {
 	ctxDone func()
 	srv     *mockLlm
 	req     *LlmRequest
-	ggml    *llm.GGML
+	f       *ggml.GGML
 }
 
-func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	return scenario.srv, nil
 }
 
@@ -115,7 +116,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	require.NoError(t, err)
 	defer f.Close()
 
-	require.NoError(t, llm.WriteGGUF(f, llm.KV{
+	require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
 		"general.architecture":          "llama",
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
@@ -125,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		"tokenizer.ggml.tokens":         []string{" "},
 		"tokenizer.ggml.scores":         []float32{0},
 		"tokenizer.ggml.token_type":     []int32{0},
-	}, []llm.Tensor{
+	}, []ggml.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}))
@@ -133,7 +134,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 
 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
-	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
+	b.f, err = llm.LoadModel(model.ModelPath, 0)
 	require.NoError(t, err)
 
 	if duration == nil {
@@ -174,7 +175,7 @@ func TestRequestsSameModelSameRequest(t *testing.T) {
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
 	b.req.model = a.req.model
-	b.ggml = a.ggml
+	b.f = a.f
 
 	s.newServerFn = a.newServer
 	slog.Info("a")
@@ -218,7 +219,7 @@ func TestRequestsSimpleReloadSameModel(t *testing.T) {
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
 	tmpModel := *a.req.model
 	b.req.model = &tmpModel
-	b.ggml = a.ggml
+	b.f = a.f
 
 	s.newServerFn = a.newServer
 	slog.Info("a")
@@ -419,13 +420,13 @@ func TestExpireRunner(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
 	}
 
-	var ggml *llm.GGML
+	var f *ggml.GGML
 	gpus := discover.GpuInfoList{}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 
 	select {
 	case err := <-req.errCh:
@@ -729,9 +730,9 @@ func TestHomogeneousGPUs(t *testing.T) {
 	}
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		require.Len(t, gpus, 1)
-		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
+		return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
 	}
 	slog.Info("a")
 	s.pendingReqCh <- a.req

+ 2 - 2
template/template_test.go

@@ -14,7 +14,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 
 func TestNamed(t *testing.T) {
@@ -33,7 +33,7 @@ func TestNamed(t *testing.T) {
 
 		for k, v := range ss {
 			t.Run(k, func(t *testing.T) {
-				kv := llm.KV{"tokenizer.chat_template": v}
+				kv := ggml.KV{"tokenizer.chat_template": v}
 				s := kv.ChatTemplate()
 				r, err := Named(s)
 				if err != nil {
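
The same rename lands here: the chat template is read through ggml.KV, as in the test above. A one-line sketch of that lookup, with an illustrative template string:

	kv := ggml.KV{"tokenizer.chat_template": "{{ .Prompt }}"}
	s := kv.ChatTemplate() // the stored template string, passed to Named in the test
	_ = s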

Some files were not shown because too many files were changed in this commit