
merge ggml file decoding

Michael Yang, 5 months ago
Commit b7943d941d

+ 16 - 16
convert/convert.go

@@ -9,7 +9,7 @@ import (
 	"log/slog"
 	"log/slog"
 	"strings"
 	"strings"
 
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 type ModelParameters struct {
 type ModelParameters struct {
@@ -27,8 +27,8 @@ type AdapterParameters struct {
 	} `json:"lora_parameters"`
 	} `json:"lora_parameters"`
 }
 }
 
 
-func (ModelParameters) KV(t *Tokenizer) llm.KV {
-	kv := llm.KV{
+func (ModelParameters) KV(t *Tokenizer) ggml.KV {
+	kv := ggml.KV{
 		"general.file_type":            uint32(1),
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
 		"general.quantization_version": uint32(2),
 		"tokenizer.ggml.pre":           t.Pre,
 		"tokenizer.ggml.pre":           t.Pre,
@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
	return kv
}

-func (p AdapterParameters) KV() llm.KV {
+func (p AdapterParameters) KV() ggml.KV {
	var alpha float32
	if p.LoraParameters.Alpha == 0 {
		alpha = float32(p.Alpha)
@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV {
		alpha = p.LoraParameters.Alpha
	}

-	kv := llm.KV{
+	kv := ggml.KV{
		"adapter.lora.alpha": alpha,
		"adapter.type":       "lora",
		"general.file_type":  uint32(1),
@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
	}
}

-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
+	return ggml.WriteGGUF(ws, kv, ts)
}

-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
+	return ggml.WriteGGUF(ws, kv, ts)
}

type ModelConverter interface {
	// KV maps parameters to LLM key-values
-	KV(*Tokenizer) llm.KV
+	KV(*Tokenizer) ggml.KV
	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
+	Tensors([]Tensor) []ggml.Tensor
	// Replacements returns a list of string pairs to replace in tensor names.
	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
	Replacements() []string
@@ -99,7 +99,7 @@ type ModelConverter interface {
	// specialTokenTypes returns any special token types the model uses
	specialTokenTypes() []string
	// writeFile writes the model to the provided io.WriteSeeker
-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
}

type moreParser interface {
@@ -108,17 +108,17 @@ type moreParser interface {

type AdapterConverter interface {
	// KV maps parameters to LLM key-values
-	KV(llm.KV) llm.KV
+	KV(ggml.KV) ggml.KV
	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
+	Tensors([]Tensor) []ggml.Tensor
	// Replacements returns a list of string pairs to replace in tensor names.
	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
	Replacements() []string

-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
}

-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
	bts, err := fs.ReadFile(fsys, "adapter_config.json")
	if err != nil {
		return err
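With this change the converters take their key-value and tensor types from fs/ggml instead of llm. A minimal sketch of what a converter's writeFile looks like after the move; the wrapper type below is hypothetical, while ggml.KV, ggml.Tensor, and ggml.WriteGGUF are the names used in the diff itself:

package convert

import (
	"io"

	"github.com/ollama/ollama/fs/ggml"
)

// exampleParameters is a hypothetical stand-in for ModelParameters or AdapterParameters.
type exampleParameters struct{}

// writeFile forwards to ggml.WriteGGUF; only the package qualifier changed from llm to ggml.
func (exampleParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
	return ggml.WriteGGUF(ws, kv, ts)
}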

+ 5 - 5
convert/convert_bert.go

@@ -8,7 +8,7 @@ import (
 	"slices"
 	"slices"
 	"strings"
 	"strings"
 
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 type bertModel struct {
 type bertModel struct {
@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
	return nil
}

-func (p *bertModel) KV(t *Tokenizer) llm.KV {
+func (p *bertModel) KV(t *Tokenizer) ggml.KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "bert"
	kv["bert.attention.causal"] = false
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
	return kv
}

-func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
	for _, t := range ts {
		if slices.Contains([]string{
			"embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
			continue
		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),

+ 5 - 5
convert/convert_gemma.go

@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/pdevine/tensor/native"
 
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 type gemmaModel struct {
 type gemmaModel struct {
@@ -23,7 +23,7 @@ type gemmaModel struct {

var _ ModelConverter = (*gemmaModel)(nil)

-func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "gemma"
	kv["gemma.context_length"] = p.MaxPositionEmbeddings
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
	return kv
}

-func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
	for _, t := range ts {
		if strings.HasSuffix(t.Name(), "_norm.weight") {
			t.SetRepacker(p.addOne)
		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),

+ 2 - 4
convert/convert_gemma2.go

@@ -1,8 +1,6 @@
package convert

-import (
-	"github.com/ollama/ollama/llm"
-)
+import "github.com/ollama/ollama/fs/ggml"

type gemma2Model struct {
	gemmaModel
@@ -11,7 +9,7 @@ type gemma2Model struct {
	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
}

-func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
+func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "gemma2"
	kv["gemma2.context_length"] = p.MaxPositionEmbeddings

+ 5 - 5
convert/convert_gemma2_adapter.go

@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/pdevine/tensor/native"
 
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 type gemma2Adapter struct {
 type gemma2Adapter struct {
@@ -15,14 +15,14 @@ type gemma2Adapter struct {

var _ AdapterConverter = (*gemma2Adapter)(nil)

-func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
+func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
	kv := p.AdapterParameters.KV()
	kv["general.architecture"] = "gemma2"
	return kv
}

-func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
	for _, t := range ts {
		shape := t.Shape()
		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
			t.SetRepacker(p.repack)
		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),

+ 6 - 6
convert/convert_llama.go

@@ -9,7 +9,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/pdevine/tensor/native"
 
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 type llamaModel struct {
 type llamaModel struct {
@@ -46,7 +46,7 @@ type llamaModel struct {

var _ ModelConverter = (*llamaModel)(nil)

-func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "llama"
	kv["llama.vocab_size"] = p.VocabSize
@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
	return kv
}

-func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor

	if p.RopeScaling.factors != nil {
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
			Name:     "rope_freqs.weight",
			Kind:     0,
			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
			t.SetRepacker(p.repack)
		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),

+ 5 - 5
convert/convert_llama_adapter.go

@@ -7,7 +7,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 	"github.com/pdevine/tensor/native"
 
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 type llamaAdapter struct {
 type llamaAdapter struct {
@@ -18,7 +18,7 @@ type llamaAdapter struct {

var _ AdapterConverter = (*llamaAdapter)(nil)

-func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
	kv := p.AdapterParameters.KV()
	kv["general.architecture"] = "llama"
	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
	return kv
}

-func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
+	var out []ggml.Tensor
	for _, t := range ts {
		shape := t.Shape()
		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
			t.SetRepacker(p.repack)
		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    shape,

+ 5 - 5
convert/convert_mixtral.go

@@ -6,7 +6,7 @@ import (
 	"slices"
 	"slices"
 	"strings"
 	"strings"
 
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 type mixtralModel struct {
 type mixtralModel struct {
@@ -15,7 +15,7 @@ type mixtralModel struct {
	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
}

-func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
	kv := p.llamaModel.KV(t)

	if p.NumLocalExperts > 0 {
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
	return kv
}

-func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
	oldnew := []string{
		"model.layers", "blk",
		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
		return true
	})

-	var out []llm.Tensor
+	var out []ggml.Tensor
	for n, e := range experts {
		// TODO(mxyng): sanity check experts
-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
			Name:     n,
			Kind:     e[0].Kind(),
			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),

+ 7 - 7
convert/convert_phi3.go

@@ -8,7 +8,7 @@ import (
 	"strings"
 	"strings"
 	"sync"
 	"sync"
 
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 type phi3Model struct {
 type phi3Model struct {
@@ -37,7 +37,7 @@ type phi3Model struct {

var _ ModelConverter = (*phi3Model)(nil)

-func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
	kv := p.ModelParameters.KV(t)
	kv["general.architecture"] = "phi3"
	kv["phi3.context_length"] = p.MaxPositionEmbeddings
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
	return kv
}

-func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
	var addRopeFactors sync.Once

-	out := make([]llm.Tensor, 0, len(ts)+2)
+	out := make([]ggml.Tensor, 0, len(ts)+2)
	for _, t := range ts {
		if strings.HasPrefix(t.Name(), "blk.0.") {
			addRopeFactors.Do(func() {
-				out = append(out, llm.Tensor{
+				out = append(out, ggml.Tensor{
					Name:     "rope_factors_long.weight",
					Kind:     0,
					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
					WriterTo: p.RopeScaling.LongFactor,
-				}, llm.Tensor{
+				}, ggml.Tensor{
					Name:     "rope_factors_short.weight",
					Kind:     0,
					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
			})
		}

-		out = append(out, llm.Tensor{
+		out = append(out, ggml.Tensor{
			Name:     t.Name(),
			Kind:     t.Kind(),
			Shape:    t.Shape(),

+ 5 - 5
convert/convert_test.go

@@ -20,7 +20,7 @@ import (

	"golang.org/x/exp/maps"

-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
)

type tensorData struct {
@@ -29,7 +29,7 @@ type tensorData struct {
	Shape   []int  `json:"shape"`
}

-func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
+func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
	}
	t.Cleanup(func() { r.Close() })

-	m, _, err := llm.DecodeGGML(r, math.MaxInt)
+	m, _, err := ggml.Decode(r, math.MaxInt)
	if err != nil {
		t.Fatal(err)
	}
@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
	return r, m.KV(), m.Tensors()
}

-func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
+func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
	actual := make(map[string]string)
	for k, v := range kv {
		if s, ok := v.(json.Marshaler); !ok {
@@ -330,7 +330,7 @@ func TestConvertAdapter(t *testing.T) {
			}
			defer r.Close()

-			m, _, err := llm.DecodeGGML(r, math.MaxInt)
+			m, _, err := ggml.Decode(r, math.MaxInt)
			if err != nil {
				t.Fatal(err)
			}
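For callers, the decoder rename means llm.DecodeGGML becomes ggml.Decode with the same arguments, and the test helper now returns ggml.Tensors by value. A hedged usage sketch, assuming a GGUF file at a hypothetical path:

package main

import (
	"fmt"
	"math"
	"os"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	f, err := os.Open("model.gguf") // hypothetical path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// math.MaxInt collects all array values, matching the tests above.
	m, _, err := ggml.Decode(f, math.MaxInt)
	if err != nil {
		panic(err)
	}

	fmt.Println(m.KV().Architecture(), len(m.Tensors().Layers()))
}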

+ 258 - 7
fs/ggml/ggml.go

@@ -1,12 +1,12 @@
package ggml

import (
-	"cmp"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"log/slog"
+	"slices"
	"strings"

	"github.com/ollama/ollama/fs/util/bufioutil"
@@ -25,7 +25,15 @@ type model interface {
type KV map[string]any

func (kv KV) Architecture() string {
-	return cmp.Or(kv.String("general.architecture"), "unknown")
+	return kv.String("general.architecture", "unknown")
+}
+
+func (kv KV) Kind() string {
+	return kv.String("general.kind", "unknown")
+}
+
+func (kv KV) ParameterCount() uint64 {
+	return keyValue[uint64](kv, "general.parameter_count")
}

func (kv KV) FileType() fileType {
@@ -36,6 +44,50 @@ func (kv KV) FileType() fileType {
	return fileTypeUnknown
}
 
+func (kv KV) BlockCount() uint64 {
+	return uint64(kv.Uint("block_count"))
+}
+
+func (kv KV) EmbeddingLength() uint64 {
+	return uint64(kv.Uint("embedding_length"))
+}
+
+func (kv KV) HeadCount() uint64 {
+	return uint64(kv.Uint("attention.head_count"))
+}
+
+func (kv KV) HeadCountKV() uint64 {
+	return uint64(kv.Uint("attention.head_count_kv", 1))
+}
+
+func (kv KV) EmbeddingHeadCount() uint64 {
+	if heads := kv.HeadCount(); heads > 0 {
+		return kv.EmbeddingLength() / heads
+	}
+
+	return 0
+}
+
+func (kv KV) EmbeddingHeadCountK() uint64 {
+	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
+}
+
+func (kv KV) EmbeddingHeadCountV() uint64 {
+	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
+}
+
+func (kv KV) GQA() uint64 {
+	return kv.HeadCount() / kv.HeadCountKV()
+}
+
+func (kv KV) ContextLength() uint64 {
+	return uint64(kv.Uint("context_length"))
+}
+
+func (kv KV) ChatTemplate() string {
+	return kv.String("tokenizer.chat_template")
+}
+
func (kv KV) String(key string, defaultValue ...string) string {
	return keyValue(kv, key, append(defaultValue, "")...)
}
@@ -68,7 +120,7 @@ func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
	return s
}

-func keyValue[T string | uint32 | float32 | *array](kv KV, key string, defaultValue ...T) T {
+func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
		key = kv.Architecture() + "." + key
	}
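keyValue, and the KV.String/KV.Uint helpers built on it, prefixes any key that does not start with "tokenizer." or "general." with the architecture name, which is what lets the accessors added above ask for "block_count" rather than "llama.block_count". A small illustrative sketch with made-up values:

package main

import (
	"fmt"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	// Illustrative values only.
	kv := ggml.KV{
		"general.architecture": "llama",
		"llama.block_count":    uint32(32),
	}

	// "block_count" is rewritten to "llama.block_count" before lookup.
	fmt.Println(kv.Architecture(), kv.Uint("block_count"), kv.BlockCount())
}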
@@ -107,7 +159,7 @@ func (ts Tensors) Layers() map[string]Layer {

type Layer map[string]*Tensor

-func (l Layer) size() (size uint64) {
+func (l Layer) Size() (size uint64) {
	for _, t := range l {
		size += t.Size()
	}
@@ -243,7 +295,7 @@ const (

var ErrUnsupportedFormat = errors.New("unsupported model format")

-func DetectGGMLType(b []byte) string {
+func DetectContentType(b []byte) string {
	switch binary.LittleEndian.Uint32(b[:4]) {
	case FILE_MAGIC_GGML:
		return "ggml"
@@ -260,12 +312,12 @@ func DetectGGMLType(b []byte) string {
	}
}

-// DecodeGGML decodes a GGML model from the given reader.
+// Decode decodes a GGML model from the given reader.
//
// It collects array values for arrays with a size less than or equal to
// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
// the maxArraySize is negative, all arrays are collected.
-func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
	if maxArraySize == 0 {
		maxArraySize = 1024
	}
@@ -303,3 +355,202 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
		model:     model,
	}, offset, nil
}
+
+func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
+	embedding := llm.KV().EmbeddingLength()
+	heads := llm.KV().HeadCount()
+	headsKV := llm.KV().HeadCountKV()
+	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
+
+	embeddingHeads := llm.KV().EmbeddingHeadCount()
+	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
+
+	layers := llm.Tensors().Layers()
+
+	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
+	kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+
+	switch llm.KV().Architecture() {
+	case "llama":
+		fullOffload = max(
+			4*batch*(1+4*embedding+context*(1+heads)),
+			4*batch*(embedding+vocab),
+		)
+
+		partialOffload = 4 * batch * embedding
+		partialOffload += max(
+			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
+
+		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
+			// mixtral 8x22b
+			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
+			partialOffload = max(
+				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
+				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
+			)
+		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
+			// mixtral 8x7b
+			ffnGateWeight1 := ffnGateWeight.Shape[1]
+			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
+			partialOffload = max(
+				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
+				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
+			)
+		}
+	case "mllama":
+		var visionTokens, tiles uint64 = 1601, 4
+
+		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+			kv = headsKV *
+				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
+				(2* // sizeof(float16)
+					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
+					context +
+					4* // sizeof(float32)
+						uint64(crossAttentionLayers.size)* // num cross attention layers
+						visionTokens*
+						tiles)
+		}
+
+		fullOffload = max(
+			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
+			// vocab graph
+			4*batch*(embedding+vocab),
+		)
+
+		var ropeFreqsCount uint64
+		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
+			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
+				ropeFreqsCount = ropeFreqsWeights.parameters()
+			}
+		}
+
+		partialOffload = max(
+			4*(batch*
+				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
+				ropeFreqsCount+
+				embeddingHeadsK*context*headsKV),
+			// vocab graph
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
+	case "gemma", "gemma2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
+		)
+
+		partialOffload = max(
+			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
+			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
+				4*embeddingHeadsK*context*8+
+				embedding*embeddingHeadsK*heads*9/16,
+		)
+	case "command-r":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(2+4*embedding+context*(1+heads)),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
+		)
+	case "qwen2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+2*embedding+context+context*heads),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
+		)
+	case "phi2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+4*embedding+context+context*heads),
+		)
+
+		partialOffload = max(
+			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(2+3*embedding+context+context*heads),
+		)
+	case "stablelm":
+		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
+		partialOffload = max(
+			4*batch*(vocab+2*embedding),
+			fullOffload,
+		)
+	case "deepseek2":
+		fullOffload = max(
+			4*batch*(3*embedding+vocab),
+			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
+		)
+
+		partialOffload = max(
+			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
+		)
+	case "chatglm":
+		fullOffload = 4 * batch * (embedding + vocab)
+		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
+		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
+			fullOffload = max(
+				fullOffload,
+				4*batch*(2+
+					2*embedding+
+					context+
+					context*heads+
+					embeddingHeadsK*heads+
+					qkvBias.Shape[0]),
+			)
+
+			partialOffload = max(
+				partialOffload,
+				4*batch*(1+
+					2*embedding+
+					embeddingHeadsK*heads+
+					context+
+					context*heads)+
+					4*embeddingHeadsK*context+
+					4*context*embeddingHeadsK+
+					4*qkvBias.Shape[0],
+			)
+		}
+	}
+
+	return
+}
+
+// SupportsKVCacheType checks if the requested cache type is supported
+func (llm GGML) SupportsKVCacheType(cacheType string) bool {
+	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
+}
+
+// SupportsFlashAttention checks if the model supports flash attention
+func (llm GGML) SupportsFlashAttention() bool {
+	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
+	if isEmbedding {
+		return false
+	}
+
+	// Check head counts match and are non-zero
+	headCountK := llm.KV().EmbeddingHeadCountK()
+	headCountV := llm.KV().EmbeddingHeadCountV()
+	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
+}
+
+// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
+func kvCacheBytesPerElement(cacheType string) float64 {
+	switch cacheType {
+	case "q8_0":
+		return 1 // 1/2 of fp16
+	case "q4_0":
+		return 0.5 // 1/4 of fp16
+	default:
+		return 2 // f16 (default)
+	}
+}
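The KV-cache estimate at the top of GraphSize is context × block_count × (attention.key_length + attention.value_length) × head_count_kv × bytes-per-element, with the last factor coming from kvCacheBytesPerElement. A rough back-of-the-envelope sketch with hypothetical numbers for a llama-style model:

package main

import "fmt"

// kvCacheBytesPerElement mirrors the helper added above.
func kvCacheBytesPerElement(cacheType string) float64 {
	switch cacheType {
	case "q8_0":
		return 1
	case "q4_0":
		return 0.5
	default:
		return 2 // f16
	}
}

func main() {
	// Hypothetical model: 32 layers, 8 KV heads, 128-dim K/V heads, 8192-token context.
	var (
		context         uint64 = 8192
		blockCount      uint64 = 32
		headsKV         uint64 = 8
		embeddingHeadsK uint64 = 128
		embeddingHeadsV uint64 = 128
	)

	for _, cacheType := range []string{"f16", "q8_0", "q4_0"} {
		bytes := float64(context*blockCount*(embeddingHeadsK+embeddingHeadsV)*headsKV) * kvCacheBytesPerElement(cacheType)
		fmt.Printf("%s: %.0f MiB\n", cacheType, bytes/(1<<20))
	}
}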

+ 0 - 185
llm/filetype.go

@@ -1,185 +0,0 @@
-package llm
-
-import "fmt"
-
-type fileType uint32
-
-const (
-	fileTypeF32 fileType = iota
-	fileTypeF16
-	fileTypeQ4_0
-	fileTypeQ4_1
-	fileTypeQ4_1_F16
-	fileTypeQ4_2 // unused
-	fileTypeQ4_3 // unused
-	fileTypeQ8_0
-	fileTypeQ5_0
-	fileTypeQ5_1
-	fileTypeQ2_K
-	fileTypeQ3_K_S
-	fileTypeQ3_K_M
-	fileTypeQ3_K_L
-	fileTypeQ4_K_S
-	fileTypeQ4_K_M
-	fileTypeQ5_K_S
-	fileTypeQ5_K_M
-	fileTypeQ6_K
-	fileTypeIQ2_XXS
-	fileTypeIQ2_XS
-	fileTypeQ2_K_S
-	fileTypeIQ3_XS
-	fileTypeIQ3_XXS
-	fileTypeIQ1_S
-	fileTypeIQ4_NL
-	fileTypeIQ3_S
-	fileTypeIQ3_M
-	fileTypeIQ2_S
-	fileTypeIQ2_M
-	fileTypeIQ4_XS
-	fileTypeIQ1_M
-	fileTypeBF16
-
-	fileTypeUnknown
-)
-
-func ParseFileType(s string) (fileType, error) {
-	switch s {
-	case "F32":
-		return fileTypeF32, nil
-	case "F16":
-		return fileTypeF16, nil
-	case "Q4_0":
-		return fileTypeQ4_0, nil
-	case "Q4_1":
-		return fileTypeQ4_1, nil
-	case "Q4_1_F16":
-		return fileTypeQ4_1_F16, nil
-	case "Q8_0":
-		return fileTypeQ8_0, nil
-	case "Q5_0":
-		return fileTypeQ5_0, nil
-	case "Q5_1":
-		return fileTypeQ5_1, nil
-	case "Q2_K":
-		return fileTypeQ2_K, nil
-	case "Q3_K_S":
-		return fileTypeQ3_K_S, nil
-	case "Q3_K_M":
-		return fileTypeQ3_K_M, nil
-	case "Q3_K_L":
-		return fileTypeQ3_K_L, nil
-	case "Q4_K_S":
-		return fileTypeQ4_K_S, nil
-	case "Q4_K_M":
-		return fileTypeQ4_K_M, nil
-	case "Q5_K_S":
-		return fileTypeQ5_K_S, nil
-	case "Q5_K_M":
-		return fileTypeQ5_K_M, nil
-	case "Q6_K":
-		return fileTypeQ6_K, nil
-	case "IQ2_XXS":
-		return fileTypeIQ2_XXS, nil
-	case "IQ2_XS":
-		return fileTypeIQ2_XS, nil
-	case "Q2_K_S":
-		return fileTypeQ2_K_S, nil
-	case "IQ3_XS":
-		return fileTypeIQ3_XS, nil
-	case "IQ3_XXS":
-		return fileTypeIQ3_XXS, nil
-	case "IQ1_S":
-		return fileTypeIQ1_S, nil
-	case "IQ4_NL":
-		return fileTypeIQ4_NL, nil
-	case "IQ3_S":
-		return fileTypeIQ3_S, nil
-	case "IQ3_M":
-		return fileTypeIQ3_M, nil
-	case "IQ2_S":
-		return fileTypeIQ2_S, nil
-	case "IQ4_XS":
-		return fileTypeIQ4_XS, nil
-	case "IQ2_M":
-		return fileTypeIQ2_M, nil
-	case "IQ1_M":
-		return fileTypeIQ1_M, nil
-	case "BF16":
-		return fileTypeBF16, nil
-	default:
-		return fileTypeUnknown, fmt.Errorf("unknown fileType: %s", s)
-	}
-}
-
-func (t fileType) String() string {
-	switch t {
-	case fileTypeF32:
-		return "F32"
-	case fileTypeF16:
-		return "F16"
-	case fileTypeQ4_0:
-		return "Q4_0"
-	case fileTypeQ4_1:
-		return "Q4_1"
-	case fileTypeQ4_1_F16:
-		return "Q4_1_F16"
-	case fileTypeQ8_0:
-		return "Q8_0"
-	case fileTypeQ5_0:
-		return "Q5_0"
-	case fileTypeQ5_1:
-		return "Q5_1"
-	case fileTypeQ2_K:
-		return "Q2_K"
-	case fileTypeQ3_K_S:
-		return "Q3_K_S"
-	case fileTypeQ3_K_M:
-		return "Q3_K_M"
-	case fileTypeQ3_K_L:
-		return "Q3_K_L"
-	case fileTypeQ4_K_S:
-		return "Q4_K_S"
-	case fileTypeQ4_K_M:
-		return "Q4_K_M"
-	case fileTypeQ5_K_S:
-		return "Q5_K_S"
-	case fileTypeQ5_K_M:
-		return "Q5_K_M"
-	case fileTypeQ6_K:
-		return "Q6_K"
-	case fileTypeIQ2_XXS:
-		return "IQ2_XXS"
-	case fileTypeIQ2_XS:
-		return "IQ2_XS"
-	case fileTypeQ2_K_S:
-		return "Q2_K_S"
-	case fileTypeIQ3_XS:
-		return "IQ3_XS"
-	case fileTypeIQ3_XXS:
-		return "IQ3_XXS"
-	case fileTypeIQ1_S:
-		return "IQ1_S"
-	case fileTypeIQ4_NL:
-		return "IQ4_NL"
-	case fileTypeIQ3_S:
-		return "IQ3_S"
-	case fileTypeIQ3_M:
-		return "IQ3_M"
-	case fileTypeIQ2_S:
-		return "IQ2_S"
-	case fileTypeIQ4_XS:
-		return "IQ4_XS"
-	case fileTypeIQ2_M:
-		return "IQ2_M"
-	case fileTypeIQ1_M:
-		return "IQ1_M"
-	case fileTypeBF16:
-		return "BF16"
-	default:
-		return "unknown"
-	}
-}
-
-func (t fileType) Value() uint32 {
-	return uint32(t)
-}

+ 0 - 149
llm/ggla.go

@@ -1,149 +0,0 @@
-package llm
-
-import (
-	"encoding/binary"
-	"errors"
-	"io"
-	"slices"
-)
-
-type containerGGLA struct {
-	version uint32
-}
-
-func (c *containerGGLA) Name() string {
-	return "ggla"
-}
-
-func (c *containerGGLA) Decode(rs io.ReadSeeker) (model, error) {
-	if err := binary.Read(rs, binary.LittleEndian, &c.version); err != nil {
-		return nil, err
-	}
-
-	switch c.version {
-	case 1:
-	default:
-		return nil, errors.New("invalid version")
-	}
-
-	model := newGGLA(c)
-	err := model.decode(rs)
-	return model, err
-}
-
-type ggla struct {
-	*containerGGLA
-
-	kv      KV
-	tensors []*Tensor
-
-	tensorOffset uint64
-}
-
-func newGGLA(container *containerGGLA) *ggla {
-	return &ggla{
-		containerGGLA: container,
-		kv:            make(KV),
-	}
-}
-
-func (llm *ggla) KV() KV {
-	return llm.kv
-}
-
-func (llm *ggla) Tensors() *Tensors {
-	return &Tensors{
-		Items:  llm.tensors,
-		Offset: llm.tensorOffset,
-	}
-}
-
-func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
-	var r uint32
-	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
-		return err
-	}
-	llm.kv["r"] = r
-
-	var alpha uint32
-	if err := binary.Read(rs, binary.LittleEndian, &alpha); err != nil {
-		return err
-	}
-	llm.kv["alpha"] = alpha
-
-	offset, err := rs.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-
-	llm.tensorOffset = uint64(offset)
-
-	for {
-		var dims uint32
-		if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
-			if errors.Is(err, io.EOF) {
-				return nil
-			}
-			return err
-		}
-
-		defer func() {
-			if errors.Is(retErr, io.EOF) {
-				retErr = io.ErrUnexpectedEOF
-			}
-		}()
-
-		var namesize uint32
-		if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
-			return err
-		}
-
-		var t Tensor
-		if err := binary.Read(rs, binary.LittleEndian, &t.Kind); err != nil {
-			return err
-		}
-
-		t.Shape = make([]uint64, dims)
-		for i := 0; uint32(i) < dims; i++ {
-			var shape32 uint32
-			if err := binary.Read(rs, binary.LittleEndian, &shape32); err != nil {
-				return err
-			}
-
-			t.Shape[i] = uint64(shape32)
-		}
-
-		// ggla tensor shape is reversed
-		// ref: https://github.com/ggerganov/llama.cpp/blob/29ae62d2ae163e2b68aa0ad3bf2ab4636de0c957/convert-lora-to-ggml.py#L44
-		slices.Reverse(t.Shape)
-
-		name := make([]byte, namesize)
-		if err := binary.Read(rs, binary.LittleEndian, &name); err != nil {
-			return err
-		}
-
-		t.Name = string(name)
-
-		offset, err := rs.Seek(0, io.SeekCurrent)
-		if err != nil {
-			return err
-		}
-
-		if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
-			return err
-		}
-
-		offset, err = rs.Seek(0, io.SeekCurrent)
-		if err != nil {
-			return err
-		}
-
-		t.Offset = uint64(offset)
-
-		if _, err := rs.Seek(int64(t.Size()), io.SeekCurrent); err != nil {
-			return err
-		}
-
-		llm.tensors = append(llm.tensors, &t)
-	}
-}

+ 0 - 561
llm/ggml.go

@@ -1,561 +0,0 @@
-package llm
-
-import (
-	"encoding/binary"
-	"errors"
-	"fmt"
-	"io"
-	"slices"
-	"strings"
-	"sync"
-
-	"github.com/ollama/ollama/fs/util/bufioutil"
-)
-
-type GGML struct {
-	container
-	model
-}
-
-type model interface {
-	KV() KV
-	Tensors() *Tensors
-}
-
-type KV map[string]any
-
-func (kv KV) u64(key string) uint64 {
-	switch v := kv[key].(type) {
-	case uint64:
-		return v
-	case uint32:
-		return uint64(v)
-	case float64:
-		return uint64(v)
-	default:
-		return 0
-	}
-}
-
-func (kv KV) Architecture() string {
-	if s, ok := kv["general.architecture"].(string); ok {
-		return s
-	}
-
-	return "unknown"
-}
-
-func (kv KV) Kind() string {
-	if s, ok := kv["general.type"].(string); ok {
-		return s
-	}
-
-	return "unknown"
-}
-
-func (kv KV) ParameterCount() uint64 {
-	return kv.u64("general.parameter_count")
-}
-
-func (kv KV) FileType() fileType {
-	if u64 := kv.u64("general.file_type"); u64 > 0 {
-		return fileType(uint32(u64))
-	}
-
-	return fileTypeUnknown
-}
-
-func (kv KV) BlockCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
-}
-
-func (kv KV) HeadCount() uint64 {
-	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
-}
-
-func (kv KV) HeadCountKV() uint64 {
-	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
-		return headCountKV
-	}
-
-	return 1
-}
-
-func (kv KV) EmbeddingHeadCount() uint64 {
-	if heads := kv.HeadCount(); heads > 0 {
-		return kv.EmbeddingLength() / kv.HeadCount()
-	}
-
-	return 0
-}
-
-func (kv KV) EmbeddingHeadCountK() uint64 {
-	if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
-		return k
-	}
-
-	return kv.EmbeddingHeadCount()
-}
-
-func (kv KV) EmbeddingHeadCountV() uint64 {
-	if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
-		return v
-	}
-
-	return kv.EmbeddingHeadCount()
-}
-
-func (kv KV) GQA() uint64 {
-	return kv.HeadCount() / kv.HeadCountKV()
-}
-
-func (kv KV) EmbeddingLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
-}
-
-func (kv KV) ContextLength() uint64 {
-	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
-}
-
-func (kv KV) ChatTemplate() string {
-	s, _ := kv["tokenizer.chat_template"].(string)
-	return s
-}
-
-type Tensors struct {
-	Items  []*Tensor
-	Offset uint64
-
-	layers     map[string]Layer
-	layersOnce sync.Once
-}
-
-func (ts *Tensors) Layers() map[string]Layer {
-	ts.layersOnce.Do(func() {
-		ts.layers = make(map[string]Layer)
-		for _, t := range ts.Items {
-			parts := strings.Split(t.Name, ".")
-			if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
-				if len(parts) > index+2 {
-					// blk and mm should have a number after them, join it
-					parts = append(
-						[]string{strings.Join(parts[:index+2], ".")},
-						parts[index+2:]...)
-				}
-			}
-
-			if _, ok := ts.layers[parts[0]]; !ok {
-				ts.layers[parts[0]] = make(Layer)
-			}
-
-			ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
-		}
-	})
-
-	return ts.layers
-}
-
-type Layer map[string]*Tensor
-
-func (l Layer) size() (size uint64) {
-	for _, t := range l {
-		size += t.Size()
-	}
-
-	return size
-}
-
-type Tensor struct {
-	Name   string `json:"name"`
-	Kind   uint32 `json:"kind"`
-	Offset uint64 `json:"-"`
-
-	// Shape is the number of elements in each dimension
-	Shape []uint64 `json:"shape"`
-
-	io.WriterTo `json:"-"`
-}
-
-func (t Tensor) block() (n int) {
-	if _, err := fmt.Sscanf(t.Name, "blk.%d.", &n); err != nil {
-		return -1
-	}
-
-	return
-}
-
-func (t Tensor) blockSize() uint64 {
-	switch t.Kind {
-	case 0, 1, 24, 25, 26, 27, 28, 30: // F32, F16, I8, I16, I32, I64, F64, BF16
-		return 1
-	case 2, 3, 4, 5, 6, 7, 8, 9, 20: // Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1, IQ4_NL
-		return 32
-	default: // All others
-		return 256
-	}
-}
-
-func (t Tensor) typeSize() uint64 {
-	blockSize := t.blockSize()
-
-	switch t.Kind {
-	case 0: // FP32
-		return 4
-	case 1: // FP16
-		return 2
-	case 2: // Q4_0
-		return 2 + blockSize/2
-	case 3: // Q4_1
-		return 2 + 2 + blockSize/2
-	case 6: // Q5_0
-		return 2 + 4 + blockSize/2
-	case 7: // Q5_1
-		return 2 + 2 + 4 + blockSize/2
-	case 8: // Q8_0
-		return 2 + blockSize
-	case 9: // Q8_1
-		return 4 + 4 + blockSize
-	case 10: // Q2_K
-		return blockSize/16 + blockSize/4 + 2 + 2
-	case 11: // Q3_K
-		return blockSize/8 + blockSize/4 + 12 + 2
-	case 12: // Q4_K
-		return 2 + 2 + 12 + blockSize/2
-	case 13: // Q5_K
-		return 2 + 2 + 12 + blockSize/8 + blockSize/2
-	case 14: // Q6_K
-		return blockSize/2 + blockSize/4 + blockSize/16 + 2
-	case 15: // Q8_K
-		return 2 + blockSize + 2*blockSize/16
-	case 16: // IQ2_XXS
-		return 2 + 2*blockSize/8
-	case 17: // IQ2_XS
-		return 2 + 2*blockSize/8 + blockSize/32
-	case 18: // IQ3_XXS
-		return 2 + blockSize/4 + blockSize/8
-	case 19: // IQ1_S
-		return 2 + blockSize/8 + blockSize/16
-	case 20: // IQ4_NL
-		return 2 + blockSize/2
-	case 21: // IQ3_S
-		return 2 + blockSize/4 + blockSize/8 + blockSize/32 + 4
-	case 22: // IQ2_S
-		return 2 + blockSize/4 + blockSize/16
-	case 23: // IQ4_XS
-		return 2 + 2 + blockSize/2 + blockSize/64
-	case 24: // I8
-		return 1
-	case 25: // I16
-		return 2
-	case 26: // I32
-		return 4
-	case 27: // I64
-		return 8
-	case 28: // F64
-		return 8
-	case 29: // IQ1_M
-		return blockSize/8 + blockSize/16 + blockSize/32
-	case 30: // BF16
-		return 2
-	default:
-		return 0
-	}
-}
-
-func (t Tensor) parameters() uint64 {
-	var count uint64 = 1
-	for _, n := range t.Shape {
-		count *= n
-	}
-	return count
-}
-
-func (t Tensor) Size() uint64 {
-	return t.parameters() * t.typeSize() / t.blockSize()
-}
-
-type container interface {
-	Name() string
-	Decode(io.ReadSeeker) (model, error)
-}
-
-const (
-	// Magic constant for `ggml` files (unversioned).
-	FILE_MAGIC_GGML = 0x67676d6c
-	// Magic constant for `ggml` files (versioned, ggmf).
-	FILE_MAGIC_GGMF = 0x67676d66
-	// Magic constant for `ggml` files (versioned, ggjt).
-	FILE_MAGIC_GGJT = 0x67676a74
-	// Magic constant for `ggla` files (LoRA adapter).
-	FILE_MAGIC_GGLA = 0x67676C61
-	// Magic constant for `gguf` files (versioned, gguf)
-	FILE_MAGIC_GGUF_LE = 0x46554747
-	FILE_MAGIC_GGUF_BE = 0x47475546
-)
-
-var ErrUnsupportedFormat = errors.New("unsupported model format")
-
-func DetectGGMLType(b []byte) string {
-	switch binary.LittleEndian.Uint32(b[:4]) {
-	case FILE_MAGIC_GGML:
-		return "ggml"
-	case FILE_MAGIC_GGMF:
-		return "ggmf"
-	case FILE_MAGIC_GGJT:
-		return "ggjt"
-	case FILE_MAGIC_GGLA:
-		return "ggla"
-	case FILE_MAGIC_GGUF_LE, FILE_MAGIC_GGUF_BE:
-		return "gguf"
-	default:
-		return ""
-	}
-}
-
-// DecodeGGML decodes a GGML model from the given reader.
-//
-// It collects array values for arrays with a size less than or equal to
-// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
-// the maxArraySize is negative, all arrays are collected.
-func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
-	if maxArraySize == 0 {
-		maxArraySize = 1024
-	}
-
-	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
-
-	var magic uint32
-	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
-		return nil, 0, err
-	}
-
-	var c container
-	switch magic {
-	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
-		return nil, 0, ErrUnsupportedFormat
-	case FILE_MAGIC_GGLA:
-		c = &containerGGLA{}
-	case FILE_MAGIC_GGUF_LE:
-		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
-	case FILE_MAGIC_GGUF_BE:
-		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
-	default:
-		return nil, 0, errors.New("invalid file magic")
-	}
-
-	model, err := c.Decode(rs)
-	if err != nil {
-		return nil, 0, err
-	}
-
-	offset, err := rs.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return nil, 0, err
-	}
-
-	// final model type
-	return &GGML{
-		container: c,
-		model:     model,
-	}, offset, nil
-}
-
-func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
-	embedding := llm.KV().EmbeddingLength()
-	heads := llm.KV().HeadCount()
-	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
-
-	embeddingHeads := llm.KV().EmbeddingHeadCount()
-	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
-	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
-
-	layers := llm.Tensors().Layers()
-
-	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
-	kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
-
-	switch llm.KV().Architecture() {
-	case "llama":
-		fullOffload = max(
-			4*batch*(1+4*embedding+context*(1+heads)),
-			4*batch*(embedding+vocab),
-		)
-
-		partialOffload = 4 * batch * embedding
-		partialOffload += max(
-			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-		)
-
-		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
-			// mixtral 8x22b
-			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
-			partialOffload = max(
-				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
-				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
-			)
-		} else if ffnGateWeight, ok := layers["blk.0"]["ffn_gate.0.weight"]; ok {
-			// mixtral 8x7b
-			ffnGateWeight1 := ffnGateWeight.Shape[1]
-			fullOffload = 4 * batch * (2 + 3*embedding + context*(1+heads) + 2*headsKV + ffnGateWeight1)
-			partialOffload = max(
-				4*batch*(3+embeddingHeads*headsKV+embedding+context*(1+heads)+ffnGateWeight1)+(embedding*embedding+3*embedding*headsKV*ffnGateWeight1)*9/16,
-				4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
-			)
-		}
-	case "mllama":
-		var visionTokens, tiles uint64 = 1601, 4
-
-		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
-			kv = headsKV *
-				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
-				(2* // sizeof(float16)
-					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
-					context +
-					4* // sizeof(float32)
-						uint64(crossAttentionLayers.size)* // num cross attention layers
-						visionTokens*
-						tiles)
-		}
-
-		fullOffload = max(
-			4*batch*(2+3*embedding+embeddingHeadsK*heads+context*(1+heads)),
-			// vocab graph
-			4*batch*(embedding+vocab),
-		)
-
-		var ropeFreqsCount uint64
-		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
-			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
-				ropeFreqsCount = ropeFreqsWeights.parameters()
-			}
-		}
-
-		partialOffload = max(
-			4*(batch*
-				(2*embedding+1+context*(1+heads)+embeddingHeadsK*heads)+
-				ropeFreqsCount+
-				embeddingHeadsK*context*headsKV),
-			// vocab graph
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-		)
-	case "gemma", "gemma2":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
-		)
-
-		partialOffload = max(
-			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
-			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
-				4*embeddingHeadsK*context*8+
-				embedding*embeddingHeadsK*heads*9/16,
-		)
-	case "command-r":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(2+4*embedding+context*(1+heads)),
-		)
-
-		partialOffload = max(
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
-		)
-	case "qwen2":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(1+2*embedding+context+context*heads),
-		)
-
-		partialOffload = max(
-			4*batch*(embedding+vocab)+embedding*vocab*105/128,
-			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
-		)
-	case "phi2":
-		fullOffload = max(
-			4*batch*(embedding+vocab),
-			4*batch*(1+4*embedding+context+context*heads),
-		)
-
-		partialOffload = max(
-			4*batch*(2*embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(2+3*embedding+context+context*heads),
-		)
-	case "stablelm":
-		fullOffload = 4 * batch * (context*(1+heads) + 3*embedding + 2)
-		partialOffload = max(
-			4*batch*(vocab+2*embedding),
-			fullOffload,
-		)
-	case "deepseek2":
-		fullOffload = max(
-			4*batch*(3*embedding+vocab),
-			4*batch*(3*embedding+2+context*(1+headsKV)+2*embeddingHeadsK*headsKV),
-		)
-
-		partialOffload = max(
-			4*batch*(3*embedding+vocab)+embedding*vocab*105/128,
-			4*batch*(2*embedding+1+2*embeddingHeadsK*headsKV+context+context*headsKV)+4*embeddingHeadsK*context*headsKV+embedding*embeddingHeadsK*headsKV*9/16,
-		)
-	case "chatglm":
-		fullOffload = 4 * batch * (embedding + vocab)
-		partialOffload = 4*batch*(embedding+vocab) + embedding*vocab*105/128
-		if qkvBias, ok := layers["blk.0"]["attn_qkv.bias"]; ok {
-			fullOffload = max(
-				fullOffload,
-				4*batch*(2+
-					2*embedding+
-					context+
-					context*heads+
-					embeddingHeadsK*heads+
-					qkvBias.Shape[0]),
-			)
-
-			partialOffload = max(
-				partialOffload,
-				4*batch*(1+
-					2*embedding+
-					embeddingHeadsK*heads+
-					context+
-					context*heads)+
-					4*embeddingHeadsK*context+
-					4*context*embeddingHeadsK+
-					4*qkvBias.Shape[0],
-			)
-		}
-	}
-
-	return
-}
-
-// SupportsKVCacheType checks if the requested cache type is supported
-func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
-	validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
-	return slices.Contains(validKVCacheTypes, cacheType)
-}
-
-// SupportsFlashAttention checks if the model supports flash attention
-func (ggml GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
-	if isEmbedding {
-		return false
-	}
-
-	// Check head counts match and are non-zero
-	headCountK := ggml.KV().EmbeddingHeadCountK()
-	headCountV := ggml.KV().EmbeddingHeadCountV()
-	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
-}
-
-// kvCacheBytesPerElement returns the number of bytes per element for a given KV cache type
-func kvCacheBytesPerElement(cacheType string) float64 {
-	switch cacheType {
-	case "q8_0":
-		return 1 // 1/2 of fp16
-	case "q4_0":
-		return 0.5 // 1/4 of fp16
-	default:
-		return 2 // f16 (default)
-	}
-}

+ 0 - 1
llm/ggml_test.go

@@ -1 +0,0 @@
-package llm

+ 0 - 662
llm/gguf.go

@@ -1,662 +0,0 @@
-package llm
-
-import (
-	"bytes"
-	"cmp"
-	"encoding/binary"
-	"encoding/json"
-	"fmt"
-	"io"
-	"log/slog"
-	"slices"
-	"strings"
-
-	"golang.org/x/exp/maps"
-)
-
-type containerGGUF struct {
-	ByteOrder binary.ByteOrder
-
-	Version uint32
-
-	V1 struct {
-		NumTensor uint32
-		NumKV     uint32
-	}
-
-	V2 struct {
-		NumTensor uint64
-		NumKV     uint64
-	}
-
-	V3 struct {
-		NumTensor uint64
-		NumKV     uint64
-	}
-
-	maxArraySize int
-}
-
-func (c *containerGGUF) canCollectArray(size int) bool {
-	return c.maxArraySize < 0 || size <= c.maxArraySize
-}
-
-func (c *containerGGUF) Name() string {
-	return "gguf"
-}
-
-func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
-	if err := binary.Read(rs, c.ByteOrder, &c.Version); err != nil {
-		return nil, err
-	}
-
-	var err error
-	switch c.Version {
-	case 1:
-		err = binary.Read(rs, c.ByteOrder, &c.V1)
-	case 2:
-		err = binary.Read(rs, c.ByteOrder, &c.V2)
-	default:
-		err = binary.Read(rs, c.ByteOrder, &c.V3)
-	}
-	if err != nil {
-		return nil, err
-	}
-
-	model := newGGUF(c)
-	if err := model.Decode(rs); err != nil {
-		return nil, err
-	}
-
-	return model, nil
-}
-
-const (
-	ggufTypeUint8 uint32 = iota
-	ggufTypeInt8
-	ggufTypeUint16
-	ggufTypeInt16
-	ggufTypeUint32
-	ggufTypeInt32
-	ggufTypeFloat32
-	ggufTypeBool
-	ggufTypeString
-	ggufTypeArray
-	ggufTypeUint64
-	ggufTypeInt64
-	ggufTypeFloat64
-)
-
-type gguf struct {
-	*containerGGUF
-
-	kv      KV
-	tensors []*Tensor
-
-	parameters   uint64
-	tensorOffset uint64
-
-	scratch [16 << 10]byte
-}
-
-func newGGUF(container *containerGGUF) *gguf {
-	return &gguf{
-		containerGGUF: container,
-		kv:            make(KV),
-	}
-}
-
-func (llm *gguf) KV() KV {
-	return llm.kv
-}
-
-func (llm *gguf) Tensors() *Tensors {
-	return &Tensors{
-		Items:  llm.tensors,
-		Offset: llm.tensorOffset,
-	}
-}
-
-func (llm *gguf) numTensor() uint64 {
-	switch llm.Version {
-	case 1:
-		return uint64(llm.V1.NumTensor)
-	case 2:
-		return llm.V2.NumTensor
-	default:
-		return llm.V3.NumTensor
-	}
-}
-
-func (llm *gguf) numKV() uint64 {
-	switch llm.Version {
-	case 1:
-		return uint64(llm.V1.NumKV)
-	case 2:
-		return llm.V2.NumKV
-	default:
-		return llm.V3.NumKV
-	}
-}
-
-func (llm *gguf) Decode(rs io.ReadSeeker) error {
-	// decode key-values
-	for i := 0; uint64(i) < llm.numKV(); i++ {
-		k, err := readGGUFString(llm, rs)
-		if err != nil {
-			return err
-		}
-
-		t, err := readGGUF[uint32](llm, rs)
-		if err != nil {
-			return err
-		}
-
-		var v any
-		switch t {
-		case ggufTypeUint8:
-			v, err = readGGUF[uint8](llm, rs)
-		case ggufTypeInt8:
-			v, err = readGGUF[int8](llm, rs)
-		case ggufTypeUint16:
-			v, err = readGGUF[uint16](llm, rs)
-		case ggufTypeInt16:
-			v, err = readGGUF[int16](llm, rs)
-		case ggufTypeUint32:
-			v, err = readGGUF[uint32](llm, rs)
-		case ggufTypeInt32:
-			v, err = readGGUF[int32](llm, rs)
-		case ggufTypeUint64:
-			v, err = readGGUF[uint64](llm, rs)
-		case ggufTypeInt64:
-			v, err = readGGUF[int64](llm, rs)
-		case ggufTypeFloat32:
-			v, err = readGGUF[float32](llm, rs)
-		case ggufTypeFloat64:
-			v, err = readGGUF[float64](llm, rs)
-		case ggufTypeBool:
-			v, err = readGGUF[bool](llm, rs)
-		case ggufTypeString:
-			v, err = readGGUFString(llm, rs)
-		case ggufTypeArray:
-			v, err = readGGUFArray(llm, rs)
-		default:
-			return fmt.Errorf("invalid type: %d", t)
-		}
-
-		if err != nil {
-			return err
-		}
-
-		llm.kv[k] = v
-	}
-
-	// decode tensors
-	for range llm.numTensor() {
-		name, err := readGGUFString(llm, rs)
-		if err != nil {
-			return fmt.Errorf("failed to read tensor name: %w", err)
-		}
-
-		// dims is the number of dimensions in the tensor
-		dims, err := readGGUF[uint32](llm, rs)
-		if err != nil {
-			return fmt.Errorf("failed to read tensor dimensions: %w", err)
-		}
-
-		shape := make([]uint64, dims)
-		for i := 0; uint32(i) < dims; i++ {
-			shape[i], err = readGGUF[uint64](llm, rs)
-			if err != nil {
-				return fmt.Errorf("failed to read tensor shape: %w", err)
-			}
-		}
-
-		kind, err := readGGUF[uint32](llm, rs)
-		if err != nil {
-			return fmt.Errorf("failed to read tensor kind: %w", err)
-		}
-
-		offset, err := readGGUF[uint64](llm, rs)
-		if err != nil {
-			return fmt.Errorf("failed to read tensor offset: %w", err)
-		}
-
-		tensor := Tensor{
-			Name:   name,
-			Kind:   kind,
-			Offset: offset,
-			Shape:  shape[:],
-		}
-
-		llm.tensors = append(llm.tensors, &tensor)
-		llm.parameters += tensor.parameters()
-	}
-
-	// patch KV with parameter count
-	llm.kv["general.parameter_count"] = llm.parameters
-
-	alignment, ok := llm.kv["general.alignment"].(uint32)
-	if !ok {
-		alignment = 32
-	}
-
-	offset, err := rs.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-
-	padding := ggufPadding(offset, int64(alignment))
-	llm.tensorOffset = uint64(offset + padding)
-
-	for _, tensor := range llm.tensors {
-		offset, err := rs.Seek(0, io.SeekCurrent)
-		if err != nil {
-			return fmt.Errorf("failed to get current offset: %w", err)
-		}
-
-		padding := ggufPadding(offset, int64(alignment))
-		if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
-			return fmt.Errorf("failed to seek to init padding: %w", err)
-		}
-
-		if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
-			return fmt.Errorf("failed to seek to tensor: %w", err)
-		}
-	}
-
-	return nil
-}
-
-func readGGUF[T any](llm *gguf, r io.Reader) (T, error) {
-	var t T
-	err := binary.Read(r, llm.ByteOrder, &t)
-	return t, err
-}
-
-func writeGGUF[V any](w io.Writer, t uint32, v V) error {
-	if err := binary.Write(w, binary.LittleEndian, t); err != nil {
-		return err
-	}
-
-	return binary.Write(w, binary.LittleEndian, v)
-}
-
-func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
-	var length uint64
-	if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
-		return "", err
-	}
-
-	var b bytes.Buffer
-	if _, err := io.CopyN(&b, r, int64(length)); err != nil {
-		return "", err
-	}
-
-	// gguf v1 strings are null-terminated
-	b.Truncate(b.Len() - 1)
-
-	return b.String(), nil
-}
-
-func discardGGUFString(llm *gguf, r io.Reader) error {
-	buf := llm.scratch[:8]
-	_, err := io.ReadFull(r, buf)
-	if err != nil {
-		return err
-	}
-
-	size := int(llm.ByteOrder.Uint64(buf))
-	for size > 0 {
-		n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
-		if err != nil {
-			return err
-		}
-		size -= n
-	}
-	return nil
-}
-
-func readGGUFString(llm *gguf, r io.Reader) (string, error) {
-	if llm.Version == 1 {
-		return readGGUFV1String(llm, r)
-	}
-
-	buf := llm.scratch[:8]
-	_, err := io.ReadFull(r, buf)
-	if err != nil {
-		return "", err
-	}
-
-	length := int(llm.ByteOrder.Uint64(buf))
-	if length > len(llm.scratch) {
-		buf = make([]byte, length)
-	} else {
-		buf = llm.scratch[:length]
-	}
-	clear(buf)
-
-	_, err = io.ReadFull(r, buf)
-	if err != nil {
-		return "", err
-	}
-	return string(buf), nil
-}
-
-func writeGGUFString(w io.Writer, s string) error {
-	if err := binary.Write(w, binary.LittleEndian, ggufTypeString); err != nil {
-		return err
-	}
-
-	if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil {
-		return err
-	}
-
-	_, err := io.Copy(w, strings.NewReader(s))
-	return err
-}
-
-type array struct {
-	size   int
-	values []any
-}
-
-func (a *array) MarshalJSON() ([]byte, error) {
-	return json.Marshal(a.values)
-}
-
-func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
-	t, err := readGGUF[uint32](llm, r)
-	if err != nil {
-		return nil, err
-	}
-
-	n, err := readGGUF[uint32](llm, r)
-	if err != nil {
-		return nil, err
-	}
-
-	a := &array{size: int(n)}
-	if llm.canCollectArray(int(n)) {
-		a.values = make([]any, int(n))
-	}
-
-	for i := range n {
-		var e any
-		switch t {
-		case ggufTypeUint8:
-			e, err = readGGUF[uint8](llm, r)
-		case ggufTypeInt8:
-			e, err = readGGUF[int8](llm, r)
-		case ggufTypeUint16:
-			e, err = readGGUF[uint16](llm, r)
-		case ggufTypeInt16:
-			e, err = readGGUF[int16](llm, r)
-		case ggufTypeUint32:
-			e, err = readGGUF[uint32](llm, r)
-		case ggufTypeInt32:
-			e, err = readGGUF[int32](llm, r)
-		case ggufTypeUint64:
-			e, err = readGGUF[uint64](llm, r)
-		case ggufTypeInt64:
-			e, err = readGGUF[int64](llm, r)
-		case ggufTypeFloat32:
-			e, err = readGGUF[float32](llm, r)
-		case ggufTypeFloat64:
-			e, err = readGGUF[float64](llm, r)
-		case ggufTypeBool:
-			e, err = readGGUF[bool](llm, r)
-		case ggufTypeString:
-			e, err = readGGUFV1String(llm, r)
-		default:
-			return nil, fmt.Errorf("invalid array type: %d", t)
-		}
-		if err != nil {
-			return nil, err
-		}
-
-		if a.values != nil {
-			a.values[i] = e
-		}
-	}
-
-	return a, nil
-}
-
-func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
-	if llm.Version == 1 {
-		return readGGUFV1Array(llm, r)
-	}
-
-	t, err := readGGUF[uint32](llm, r)
-	if err != nil {
-		return nil, err
-	}
-
-	n, err := readGGUF[uint64](llm, r)
-	if err != nil {
-		return nil, err
-	}
-
-	a := &array{size: int(n)}
-	if llm.canCollectArray(int(n)) {
-		a.values = make([]any, int(n))
-	}
-
-	for i := range n {
-		var e any
-		switch t {
-		case ggufTypeUint8:
-			e, err = readGGUF[uint8](llm, r)
-		case ggufTypeInt8:
-			e, err = readGGUF[int8](llm, r)
-		case ggufTypeUint16:
-			e, err = readGGUF[uint16](llm, r)
-		case ggufTypeInt16:
-			e, err = readGGUF[int16](llm, r)
-		case ggufTypeUint32:
-			e, err = readGGUF[uint32](llm, r)
-		case ggufTypeInt32:
-			e, err = readGGUF[int32](llm, r)
-		case ggufTypeUint64:
-			e, err = readGGUF[uint64](llm, r)
-		case ggufTypeInt64:
-			e, err = readGGUF[int64](llm, r)
-		case ggufTypeFloat32:
-			e, err = readGGUF[float32](llm, r)
-		case ggufTypeFloat64:
-			e, err = readGGUF[float64](llm, r)
-		case ggufTypeBool:
-			e, err = readGGUF[bool](llm, r)
-		case ggufTypeString:
-			if a.values != nil {
-				e, err = readGGUFString(llm, r)
-			} else {
-				err = discardGGUFString(llm, r)
-			}
-		default:
-			return nil, fmt.Errorf("invalid array type: %d", t)
-		}
-		if err != nil {
-			return nil, err
-		}
-
-		if a.values != nil {
-			a.values[i] = e
-		}
-	}
-
-	return a, nil
-}
-
-// writeGGUFArray writes a slice s of type E to the writer with a gguf type of t
-func writeGGUFArray[S ~[]E, E any](w io.Writer, t uint32, s S) error {
-	if err := binary.Write(w, binary.LittleEndian, ggufTypeArray); err != nil {
-		return err
-	}
-
-	if err := binary.Write(w, binary.LittleEndian, t); err != nil {
-		return err
-	}
-
-	if err := binary.Write(w, binary.LittleEndian, uint64(len(s))); err != nil {
-		return err
-	}
-
-	return binary.Write(w, binary.LittleEndian, s)
-}
-
-func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
-	if err := binary.Write(ws, binary.LittleEndian, []byte("GGUF")); err != nil {
-		return err
-	}
-
-	if err := binary.Write(ws, binary.LittleEndian, uint32(3)); err != nil {
-		return err
-	}
-
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(ts))); err != nil {
-		return err
-	}
-
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(kv))); err != nil {
-		return err
-	}
-
-	keys := maps.Keys(kv)
-	slices.Sort(keys)
-
-	for _, key := range keys {
-		if err := ggufWriteKV(ws, key, kv[key]); err != nil {
-			return err
-		}
-	}
-
-	slices.SortStableFunc(ts, func(a, b Tensor) int {
-		if i, j := a.block(), b.block(); i < 0 && j > 0 {
-			return 1
-		} else if i > 0 && j < 0 {
-			return -1
-		} else {
-			return cmp.Compare(i, j)
-		}
-	})
-
-	var s uint64
-	for _, t := range ts {
-		t.Offset = s
-		if err := ggufWriteTensorInfo(ws, t); err != nil {
-			return err
-		}
-		s += t.Size()
-	}
-
-	var alignment int64 = 32
-	for _, t := range ts {
-		if err := ggufWriteTensor(ws, t, alignment); err != nil {
-			return err
-		}
-	}
-
-	return nil
-}
-
-func ggufWriteKV(ws io.WriteSeeker, k string, v any) error {
-	slog.Debug(k, "type", fmt.Sprintf("%T", v))
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(k))); err != nil {
-		return err
-	}
-
-	if err := binary.Write(ws, binary.LittleEndian, []byte(k)); err != nil {
-		return err
-	}
-
-	var err error
-	switch v := v.(type) {
-	case uint32:
-		err = writeGGUF(ws, ggufTypeUint32, v)
-	case float32:
-		err = writeGGUF(ws, ggufTypeFloat32, v)
-	case bool:
-		err = writeGGUF(ws, ggufTypeBool, v)
-	case string:
-		err = writeGGUFString(ws, v)
-	case []int32:
-		err = writeGGUFArray(ws, ggufTypeInt32, v)
-	case []uint32:
-		err = writeGGUFArray(ws, ggufTypeUint32, v)
-	case []float32:
-		err = writeGGUFArray(ws, ggufTypeFloat32, v)
-	case []string:
-		if err := binary.Write(ws, binary.LittleEndian, ggufTypeArray); err != nil {
-			return err
-		}
-
-		if err := binary.Write(ws, binary.LittleEndian, ggufTypeString); err != nil {
-			return err
-		}
-
-		if err := binary.Write(ws, binary.LittleEndian, uint64(len(v))); err != nil {
-			return err
-		}
-
-		for _, e := range v {
-			if err := binary.Write(ws, binary.LittleEndian, uint64(len(e))); err != nil {
-				return err
-			}
-
-			if err := binary.Write(ws, binary.LittleEndian, []byte(e)); err != nil {
-				return err
-			}
-		}
-	default:
-		return fmt.Errorf("improper type for '%s'", k)
-	}
-
-	return err
-}
-
-func ggufWriteTensorInfo(ws io.WriteSeeker, t Tensor) error {
-	slog.Debug(t.Name, "kind", t.Kind, "shape", t.Shape, "offset", t.Offset)
-	if err := binary.Write(ws, binary.LittleEndian, uint64(len(t.Name))); err != nil {
-		return err
-	}
-
-	if err := binary.Write(ws, binary.LittleEndian, []byte(t.Name)); err != nil {
-		return err
-	}
-
-	if err := binary.Write(ws, binary.LittleEndian, uint32(len(t.Shape))); err != nil {
-		return err
-	}
-
-	for i := range len(t.Shape) {
-		if err := binary.Write(ws, binary.LittleEndian, t.Shape[len(t.Shape)-i-1]); err != nil {
-			return err
-		}
-	}
-
-	if err := binary.Write(ws, binary.LittleEndian, t.Kind); err != nil {
-		return err
-	}
-
-	return binary.Write(ws, binary.LittleEndian, t.Offset)
-}
-
-func ggufWriteTensor(ws io.WriteSeeker, t Tensor, alignment int64) error {
-	offset, err := ws.Seek(0, io.SeekCurrent)
-	if err != nil {
-		return err
-	}
-
-	if err := binary.Write(ws, binary.LittleEndian, bytes.Repeat([]byte{0}, int(ggufPadding(offset, alignment)))); err != nil {
-		return err
-	}
-
-	_, err = t.WriteTo(ws)
-	return err
-}
-
-func ggufPadding(offset, align int64) int64 {
-	return (align - offset%align) % align
-}
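
For reference, the alignment rule in the deleted decoder pads tensor data to the next multiple of general.alignment (32 when the key is absent). The following standalone Go sketch is not part of the change; it only reproduces the padding formula and prints a few padded offsets.

package main

import "fmt"

// ggufPadding mirrors the formula from the deleted file: the number of bytes
// needed to reach the next multiple of align.
func ggufPadding(offset, align int64) int64 {
	return (align - offset%align) % align
}

func main() {
	for _, offset := range []int64{0, 1, 31, 32, 33, 100} {
		pad := ggufPadding(offset, 32)
		fmt.Printf("offset=%3d padding=%2d next=%3d\n", offset, pad, offset+pad)
	}
	// offset=  0 padding= 0 next=  0
	// offset=  1 padding=31 next= 32
	// offset= 31 padding= 1 next= 32
	// offset= 33 padding=31 next= 64
	// offset=100 padding=28 next=128
}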

+ 26 - 27
llm/memory.go

@@ -11,18 +11,19 @@ import (
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
 		var layerCount int
-		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate := EstimateGPULayers(gpus, f, projectors, opts)
 		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
 		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
 		if opts.NumGPU < 0 {
 		if opts.NumGPU < 0 {
-			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
+			if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
 				return true, estimatedVRAM
 				return true, estimatedVRAM
 			}
 			}
 		} else {
 		} else {
@@ -70,7 +71,7 @@ type MemoryEstimate struct {
 
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options) MemoryEstimate {
 	// Graph size for a partial offload, applies to all GPUs
 	// Graph size for a partial offload, applies to all GPUs
 	var graphPartialOffload uint64
 	var graphPartialOffload uint64
 
 
@@ -115,33 +116,31 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 		opts.NumCtx = max(opts.NumCtx, 2048)
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 	}
 
 
-	layers := ggml.Tensors().Layers()
+	layers := f.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
 	if blk0, ok := layers["blk.0"]; ok {
-		layerSize = blk0.size()
+		layerSize = blk0.Size()
 	} else {
 	} else {
 		slog.Warn("model missing blk.0 layer size")
 		slog.Warn("model missing blk.0 layer size")
 	}
 	}
 
 
-	fa := envconfig.FlashAttention() &&
-		discover.GetGPUInfo().FlashAttentionSupported() &&
-		ggml.SupportsFlashAttention()
-
 	var kvct string
 	var kvct string
-	if fa {
+	if envconfig.FlashAttention() &&
+		discover.GetGPUInfo().FlashAttentionSupported() &&
+		f.SupportsFlashAttention() {
 		requested := strings.ToLower(envconfig.KvCacheType())
 		requested := strings.ToLower(envconfig.KvCacheType())
-		if requested != "" && ggml.SupportsKVCacheType(requested) {
+		if requested != "" && f.SupportsKVCacheType(requested) {
 			kvct = requested
 			kvct = requested
 		}
 		}
 	}
 	}
 
 
-	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
+	kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), kvct)
 
 
 	// KV is proportional to the number of layers
 	// KV is proportional to the number of layers
-	layerSize += kv / ggml.KV().BlockCount()
+	layerSize += kv / f.KV().BlockCount()
 
 
 	if graphPartialOffload == 0 {
 	if graphPartialOffload == 0 {
-		graphPartialOffload = ggml.KV().GQA() * kv / 6
+		graphPartialOffload = f.KV().GQA() * kv / 6
 	}
 	}
 	if graphFullOffload == 0 {
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
 		graphFullOffload = graphPartialOffload
@@ -156,12 +155,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 	}
 	}
 
 
 	if layer, ok := layers["output_norm"]; ok {
 	if layer, ok := layers["output_norm"]; ok {
-		memoryLayerOutput += layer.size()
+		memoryLayerOutput += layer.Size()
 	}
 	}
 	if layer, ok := layers["output"]; ok {
 	if layer, ok := layers["output"]; ok {
-		memoryLayerOutput += layer.size()
+		memoryLayerOutput += layer.Size()
 	} else if layer, ok := layers["token_embd"]; ok {
 	} else if layer, ok := layers["token_embd"]; ok {
-		memoryLayerOutput += layer.size()
+		memoryLayerOutput += layer.Size()
 	}
 	}
 
 
 	// Output layer handled at the end if we have space
 	// Output layer handled at the end if we have space
@@ -211,11 +210,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 	}
 	}
 
 
 	// For all the layers, find where they can fit on the GPU(s)
 	// For all the layers, find where they can fit on the GPU(s)
-	for i := range int(ggml.KV().BlockCount()) {
+	for i := range int(f.KV().BlockCount()) {
 		// Some models have inconsistent layer sizes
 		// Some models have inconsistent layer sizes
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
-			layerSize = blk.size()
-			layerSize += kv / ggml.KV().BlockCount()
+			layerSize = blk.Size()
+			layerSize += kv / f.KV().BlockCount()
 		}
 		}
 		memoryWeights += layerSize
 		memoryWeights += layerSize
 
 
@@ -238,10 +237,10 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 			}
 			}
 		}
 		}
 	}
 	}
-	if layerCount >= int(ggml.KV().BlockCount()) {
+	if layerCount >= int(f.KV().BlockCount()) {
 		fullyLoaded = true
 		fullyLoaded = true
 	} else {
 	} else {
-		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+		for i := layerCount; i < int(f.KV().BlockCount()); i++ {
 			overflow += layerSize
 			overflow += layerSize
 		}
 		}
 	}
 	}
@@ -259,7 +258,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 			}
 			}
 		}
 		}
 
 
-		if layerCount < int(ggml.KV().BlockCount())+1 {
+		if layerCount < int(f.KV().BlockCount())+1 {
 			fullyLoaded = false
 			fullyLoaded = false
 			overflow += memoryLayerOutput
 			overflow += memoryLayerOutput
 		}
 		}
@@ -311,7 +310,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 
 
 		inferenceLibrary:    gpus[0].Library,
 		inferenceLibrary:    gpus[0].Library,
 		layersRequested:     opts.NumGPU,
 		layersRequested:     opts.NumGPU,
-		layersModel:         int(ggml.KV().BlockCount()) + 1,
+		layersModel:         int(f.KV().BlockCount()) + 1,
 		availableList:       availableList,
 		availableList:       availableList,
 		kv:                  kv,
 		kv:                  kv,
 		allocationsList:     allocationsList,
 		allocationsList:     allocationsList,
@@ -409,13 +408,13 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 	}
 	}
 	defer file.Close()
 	defer file.Close()
 
 
-	ggml, _, err := DecodeGGML(file, 0)
+	ggml, _, err := ggml.Decode(file, 0)
 	if err != nil {
 	if err != nil {
 		return 0, 0
 		return 0, 0
 	}
 	}
 
 
 	for _, layer := range ggml.Tensors().Layers() {
 	for _, layer := range ggml.Tensors().Layers() {
-		weights += layer.size()
+		weights += layer.Size()
 	}
 	}
 
 
 	switch arch := ggml.KV().Architecture(); arch {
 	switch arch := ggml.KV().Architecture(); arch {
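
The estimator keeps the same accounting after the rename: one block's tensors plus a per-block share of the KV cache, with the output layer handled last if it fits. A usage sketch under stated assumptions; api.DefaultOptions and the "model.gguf" path are placeholders for this example, not taken from the change.

package main

import (
	"fmt"
	"log"
	"os"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/llm"
)

func main() {
	file, err := os.Open("model.gguf") // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// maxArraySize 0 keeps the default cap on collected KV arrays.
	f, _, err := ggml.Decode(file, 0)
	if err != nil {
		log.Fatal(err)
	}

	gpus := discover.GetGPUInfo()
	opts := api.DefaultOptions() // assumed helper, as used elsewhere in the repository

	estimate := llm.EstimateGPULayers(gpus, f, nil, opts)
	fmt.Printf("layers %d of %d, vram %s, total %s\n",
		estimate.Layers, f.KV().BlockCount()+1,
		format.HumanBytes2(estimate.VRAMSize),
		format.HumanBytes2(estimate.TotalSize))
}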

+ 3 - 2
llm/memory_test.go

@@ -11,6 +11,7 @@ import (
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/discover"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 func TestEstimateGPULayers(t *testing.T) {
 func TestEstimateGPULayers(t *testing.T) {
@@ -23,7 +24,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	defer f.Close()
 	defer f.Close()
 	inputLayerCount := 5
 	inputLayerCount := 5
 
 
-	tensors := []Tensor{
+	tensors := []ggml.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
@@ -32,7 +33,7 @@ func TestEstimateGPULayers(t *testing.T) {
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}
 	}
 	assert.Len(t, tensors, inputLayerCount+1)
 	assert.Len(t, tensors, inputLayerCount+1)
-	err = WriteGGUF(f, KV{
+	err = ggml.WriteGGUF(f, ggml.KV{
 		"general.architecture":          "llama",
 		"general.architecture":          "llama",
 		"llama.context_length":          uint32(32),
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
 		"llama.embedding_length":        uint32(4096),

+ 10 - 9
llm/server.go

@@ -28,6 +28,7 @@ import (
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/runners"
 	"github.com/ollama/ollama/runners"
 )
 )
@@ -72,7 +73,7 @@ type llmServer struct {
 // It collects array values for arrays with a size less than or equal to
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
 // the maxArraySize is negative, all arrays are collected.
 // the maxArraySize is negative, all arrays are collected.
-func LoadModel(model string, maxArraySize int) (*GGML, error) {
+func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 	if _, err := os.Stat(model); err != nil {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 		return nil, err
 	}
 	}
@@ -83,13 +84,13 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 	}
 	}
 	defer f.Close()
 	defer f.Close()
 
 
-	ggml, _, err := DecodeGGML(f, maxArraySize)
+	ggml, _, err := ggml.Decode(f, maxArraySize)
 	return ggml, err
 	return ggml, err
 }
 }
 
 
 // NewLlamaServer will run a server for the given GPUs
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
 // The gpu list must be a single family.
-func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
 	var err error
 	var cpuRunner string
 	var cpuRunner string
 	var estimate MemoryEstimate
 	var estimate MemoryEstimate
@@ -109,9 +110,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	}
 	}
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		cpuRunner = runners.ServerForCpu()
 		cpuRunner = runners.ServerForCpu()
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = EstimateGPULayers(gpus, f, projectors, opts)
 	} else {
 	} else {
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = EstimateGPULayers(gpus, f, projectors, opts)
 
 
 		switch {
 		switch {
 		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
 		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
@@ -212,7 +213,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		fa = false
 		fa = false
 	}
 	}
 
 
-	if fa && !ggml.SupportsFlashAttention() {
+	if fa && !f.SupportsFlashAttention() {
 		slog.Warn("flash attention enabled but not supported by model")
 		slog.Warn("flash attention enabled but not supported by model")
 		fa = false
 		fa = false
 	}
 	}
@@ -225,7 +226,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 
 
 		// Flash Attention also supports kv cache quantization
 		// Flash Attention also supports kv cache quantization
 		// Enable if the requested kv cache type is supported by the model
 		// Enable if the requested kv cache type is supported by the model
-		if kvct != "" && ggml.SupportsKVCacheType(kvct) {
+		if kvct != "" && f.SupportsKVCacheType(kvct) {
 			params = append(params, "--kv-cache-type", kvct)
 			params = append(params, "--kv-cache-type", kvct)
 		} else {
 		} else {
 			slog.Warn("kv cache type not supported by model", "type", kvct)
 			slog.Warn("kv cache type not supported by model", "type", kvct)
@@ -238,7 +239,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	for _, g := range gpus {
 	for _, g := range gpus {
 		if g.Library == "metal" &&
 		if g.Library == "metal" &&
 			uint64(opts.NumGPU) > 0 &&
 			uint64(opts.NumGPU) > 0 &&
-			uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
+			uint64(opts.NumGPU) < f.KV().BlockCount()+1 {
 			opts.UseMMap = new(bool)
 			opts.UseMMap = new(bool)
 			*opts.UseMMap = false
 			*opts.UseMMap = false
 		}
 		}
@@ -330,7 +331,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			estimate:    estimate,
 			estimate:    estimate,
 			numParallel: numParallel,
 			numParallel: numParallel,
 			sem:         semaphore.NewWeighted(int64(numParallel)),
 			sem:         semaphore.NewWeighted(int64(numParallel)),
-			totalLayers: ggml.KV().BlockCount() + 1,
+			totalLayers: f.KV().BlockCount() + 1,
 			gpus:        gpus,
 			gpus:        gpus,
 			done:        make(chan error, 1),
 			done:        make(chan error, 1),
 		}
 		}
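
A sketch of the maxArraySize behaviour documented on LoadModel above; the path is a placeholder and the printed fields are only examples.

package main

import (
	"fmt"
	"log"

	"github.com/ollama/ollama/llm"
)

func main() {
	// maxArraySize 0: metadata only; arrays above the default 1024-entry cap
	// are parsed but their values are not collected.
	meta, err := llm.LoadModel("model.gguf", 0) // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("arch:", meta.KV().Architecture(), "blocks:", meta.KV().BlockCount())

	// maxArraySize -1: collect every array, e.g. to inspect the tokenizer vocabulary.
	full, err := llm.LoadModel("model.gguf", -1)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("kv entries:", len(full.KV()))
}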

+ 1 - 1
ml/backend/ggml/backend.go

@@ -29,7 +29,7 @@ type Backend struct {
 }
 }
 
 
 func New(r io.ReadSeeker) (ml.Backend, error) {
 func New(r io.ReadSeeker) (ml.Backend, error) {
-	f, _, err := ggml.DecodeGGML(r, -1)
+	f, _, err := ggml.Decode(r, -1)
 	if err != nil {
 	if err != nil {
 		return nil, err
 		return nil, err
 	}
 	}

+ 4 - 4
server/images.go

@@ -25,8 +25,8 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/types/model"
@@ -89,7 +89,7 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
 			defer f.Close()
 			defer f.Close()
 
 
 			// TODO(mxyng): decode the GGML into model to avoid doing this multiple times
 			// TODO(mxyng): decode the GGML into model to avoid doing this multiple times
-			ggml, _, err := llm.DecodeGGML(f, 0)
+			ggml, _, err := ggml.Decode(f, 0)
 			if err != nil {
 			if err != nil {
 				slog.Error("couldn't decode ggml", "error", err)
 				slog.Error("couldn't decode ggml", "error", err)
 				continue
 				continue
@@ -429,7 +429,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 					baseLayer.MediaType == "application/vnd.ollama.image.model" &&
 					baseLayer.MediaType == "application/vnd.ollama.image.model" &&
 					baseLayer.GGML != nil &&
 					baseLayer.GGML != nil &&
 					baseLayer.GGML.Name() == "gguf" {
 					baseLayer.GGML.Name() == "gguf" {
-					want, err := llm.ParseFileType(quantization)
+					want, err := ggml.ParseFileType(quantization)
 					if err != nil {
 					if err != nil {
 						return err
 						return err
 					}
 					}
@@ -465,7 +465,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 							return err
 							return err
 						}
 						}
 
 
-						ggml, _, err := llm.DecodeGGML(temp, 0)
+						ggml, _, err := ggml.Decode(temp, 0)
 						if err != nil {
 						if err != nil {
 							return err
 							return err
 						}
 						}

+ 7 - 7
server/model.go

@@ -18,7 +18,7 @@ import (
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/convert"
 	"github.com/ollama/ollama/convert"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/types/model"
 )
 )
@@ -27,7 +27,7 @@ var intermediateBlobs map[string]string = make(map[string]string)
 
 
 type layerGGML struct {
 type layerGGML struct {
 	Layer
 	Layer
-	*llm.GGML
+	*ggml.GGML
 }
 }
 
 
 func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
 func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
@@ -67,7 +67,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 			}
 			}
 			defer blob.Close()
 			defer blob.Close()
 
 
-			ggml, _, err := llm.DecodeGGML(blob, 0)
+			ggml, _, err := ggml.Decode(blob, 0)
 			if err != nil {
 			if err != nil {
 				return nil, err
 				return nil, err
 			}
 			}
@@ -112,7 +112,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
 
 
 	switch command {
 	switch command {
 	case "adapter":
 	case "adapter":
-		var baseModel *llm.GGML
+		var baseModel *ggml.GGML
 		for _, l := range baseLayers {
 		for _, l := range baseLayers {
 			if l.GGML != nil {
 			if l.GGML != nil {
 				baseModel = l.GGML
 				baseModel = l.GGML
@@ -150,7 +150,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
 	}
 	}
 	defer bin.Close()
 	defer bin.Close()
 
 
-	ggml, _, err := llm.DecodeGGML(bin, 0)
+	ggml, _, err := ggml.Decode(bin, 0)
 	if err != nil {
 	if err != nil {
 		return nil, err
 		return nil, err
 	}
 	}
@@ -184,7 +184,7 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML,
 
 
 	var offset int64
 	var offset int64
 	for offset < stat.Size() {
 	for offset < stat.Size() {
-		ggml, n, err := llm.DecodeGGML(file, 0)
+		ggml, n, err := ggml.Decode(file, 0)
 		if errors.Is(err, io.EOF) {
 		if errors.Is(err, io.EOF) {
 			break
 			break
 		} else if err != nil {
 		} else if err != nil {
@@ -263,7 +263,7 @@ func detectContentType(r io.Reader) (string, error) {
 		return "", err
 		return "", err
 	}
 	}
 
 
-	if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" {
+	if contentType := ggml.DetectContentType(b.Bytes()); contentType != "" {
 		return contentType, nil
 		return contentType, nil
 	}
 	}
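
The renamed sniffing helper can be exercised on its own. A sketch, assuming a placeholder blob path and an arbitrary 512-byte prefix; an empty return value means the bytes are not a GGML-family container.

package main

import (
	"errors"
	"fmt"
	"io"
	"log"
	"os"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	file, err := os.Open("blob.bin") // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// Read a prefix of the blob; 512 bytes is an arbitrary size for this sketch.
	prefix := make([]byte, 512)
	n, err := io.ReadFull(file, prefix)
	if err != nil && !errors.Is(err, io.ErrUnexpectedEOF) && !errors.Is(err, io.EOF) {
		log.Fatal(err)
	}

	if ct := ggml.DetectContentType(prefix[:n]); ct != "" {
		fmt.Println("detected container:", ct) // e.g. "gguf"
	} else {
		fmt.Println("not a GGML-family file")
	}
}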
 
 

+ 3 - 3
server/model_test.go

@@ -13,7 +13,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/template"
 )
 )
 
 
@@ -148,7 +148,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 		t.Fatalf("failed to open file: %v", err)
 		t.Fatalf("failed to open file: %v", err)
 	}
 	}
 	defer file.Close()
 	defer file.Close()
-	if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
+	if err := ggml.WriteGGUF(file, ggml.KV{"general.architecture": "gemma"}, []ggml.Tensor{}); err != nil {
 		t.Fatalf("failed to write gguf: %v", err)
 		t.Fatalf("failed to write gguf: %v", err)
 	}
 	}
 
 
@@ -201,7 +201,7 @@ func TestParseLayerFromCopy(t *testing.T) {
 	defer file2.Close()
 	defer file2.Close()
 
 
 	for range 5 {
 	for range 5 {
-		if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
+		if err := ggml.WriteGGUF(file2, ggml.KV{"general.architecture": "gemma"}, []ggml.Tensor{}); err != nil {
 			t.Fatalf("failed to write gguf: %v", err)
 			t.Fatalf("failed to write gguf: %v", err)
 		}
 		}
 	}
 	}

+ 2 - 1
server/routes.go

@@ -29,6 +29,7 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/parser"
@@ -870,7 +871,7 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	return resp, nil
 	return resp, nil
 }
 }
 
 
-func getKVData(digest string, verbose bool) (llm.KV, error) {
+func getKVData(digest string, verbose bool) (ggml.KV, error) {
 	maxArraySize := 0
 	maxArraySize := 0
 	if verbose {
 	if verbose {
 		maxArraySize = -1
 		maxArraySize = -1

+ 4 - 4
server/routes_create_test.go

@@ -16,12 +16,12 @@ import (
 	"github.com/gin-gonic/gin"
 	"github.com/gin-gonic/gin"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )
 )
 
 
 var stream bool = false
 var stream bool = false
 
 
-func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
+func createBinFile(t *testing.T, kv map[string]any, ti []ggml.Tensor) string {
 	t.Helper()
 	t.Helper()
 
 
 	f, err := os.CreateTemp(t.TempDir(), "")
 	f, err := os.CreateTemp(t.TempDir(), "")
@@ -30,7 +30,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
 	}
 	}
 	defer f.Close()
 	defer f.Close()
 
 
-	if err := llm.WriteGGUF(f, kv, ti); err != nil {
+	if err := ggml.WriteGGUF(f, kv, ti); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
@@ -581,7 +581,7 @@ func TestCreateDetectTemplate(t *testing.T) {
 	t.Run("matched", func(t *testing.T) {
 	t.Run("matched", func(t *testing.T) {
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name: "test",
 			Name: "test",
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, ggml.KV{
 				"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
 				"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
 			}, nil)),
 			}, nil)),
 			Stream: &stream,
 			Stream: &stream,
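
The helper above is essentially a write-then-decode round trip against the new package, and the same flow works outside the test suite. A sketch using a temporary file, for illustration only.

package main

import (
	"bytes"
	"fmt"
	"io"
	"log"
	"os"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	f, err := os.CreateTemp("", "*.gguf")
	if err != nil {
		log.Fatal(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	// A minimal model: one KV entry and one tiny F32 tensor (kind 0, 4 bytes).
	kv := ggml.KV{"general.architecture": "llama"}
	tensors := []ggml.Tensor{
		{Name: "token_embd.weight", Kind: 0, Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
	}

	if err := ggml.WriteGGUF(f, kv, tensors); err != nil {
		log.Fatal(err)
	}

	if _, err := f.Seek(0, io.SeekStart); err != nil {
		log.Fatal(err)
	}

	decoded, _, err := ggml.Decode(f, 0)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("architecture:", decoded.KV().Architecture())
}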

+ 13 - 12
server/routes_generate_test.go

@@ -17,6 +17,7 @@ import (
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/discover"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/llm"
 )
 )
 
 
@@ -46,8 +47,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 	return
 }
 }
 
 
-func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+	return func(gpus discover.GpuInfoList, model string, f *ggml.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return mock, nil
 		return mock, nil
 	}
 	}
 }
 }
@@ -77,7 +78,7 @@ func TestGenerateChat(t *testing.T) {
 			getGpuFn:      discover.GetGPUInfo,
 			getGpuFn:      discover.GetGPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
 				req.successCh <- &runnerRef{
@@ -101,7 +102,7 @@ func TestGenerateChat(t *testing.T) {
 {{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
 {{- range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}
 {{- end }}
 {{- end }}
 {{ end }}"""
 {{ end }}"""
-`, createBinFile(t, llm.KV{
+`, createBinFile(t, ggml.KV{
 			"general.architecture":          "llama",
 			"general.architecture":          "llama",
 			"llama.block_count":             uint32(1),
 			"llama.block_count":             uint32(1),
 			"llama.context_length":          uint32(8192),
 			"llama.context_length":          uint32(8192),
@@ -111,7 +112,7 @@ func TestGenerateChat(t *testing.T) {
 			"tokenizer.ggml.tokens":         []string{""},
 			"tokenizer.ggml.tokens":         []string{""},
 			"tokenizer.ggml.scores":         []float32{0},
 			"tokenizer.ggml.scores":         []float32{0},
 			"tokenizer.ggml.token_type":     []int32{0},
 			"tokenizer.ggml.token_type":     []int32{0},
-		}, []llm.Tensor{
+		}, []ggml.Tensor{
 			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -156,10 +157,10 @@ func TestGenerateChat(t *testing.T) {
 	t.Run("missing capabilities chat", func(t *testing.T) {
 	t.Run("missing capabilities chat", func(t *testing.T) {
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model: "bert",
 			Model: "bert",
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, ggml.KV{
 				"general.architecture": "bert",
 				"general.architecture": "bert",
 				"bert.pooling_type":    uint32(0),
 				"bert.pooling_type":    uint32(0),
-			}, []llm.Tensor{})),
+			}, []ggml.Tensor{})),
 			Stream: &stream,
 			Stream: &stream,
 		})
 		})
 
 
@@ -610,7 +611,7 @@ func TestGenerate(t *testing.T) {
 			getGpuFn:      discover.GetGPUInfo,
 			getGpuFn:      discover.GetGPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
 				req.successCh <- &runnerRef{
@@ -629,7 +630,7 @@ func TestGenerate(t *testing.T) {
 {{- if .System }}System: {{ .System }} {{ end }}
 {{- if .System }}System: {{ .System }} {{ end }}
 {{- if .Prompt }}User: {{ .Prompt }} {{ end }}
 {{- if .Prompt }}User: {{ .Prompt }} {{ end }}
 {{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
 {{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
-`, createBinFile(t, llm.KV{
+`, createBinFile(t, ggml.KV{
 			"general.architecture":          "llama",
 			"general.architecture":          "llama",
 			"llama.block_count":             uint32(1),
 			"llama.block_count":             uint32(1),
 			"llama.context_length":          uint32(8192),
 			"llama.context_length":          uint32(8192),
@@ -639,7 +640,7 @@ func TestGenerate(t *testing.T) {
 			"tokenizer.ggml.tokens":         []string{""},
 			"tokenizer.ggml.tokens":         []string{""},
 			"tokenizer.ggml.scores":         []float32{0},
 			"tokenizer.ggml.scores":         []float32{0},
 			"tokenizer.ggml.token_type":     []int32{0},
 			"tokenizer.ggml.token_type":     []int32{0},
-		}, []llm.Tensor{
+		}, []ggml.Tensor{
 			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -684,10 +685,10 @@ func TestGenerate(t *testing.T) {
 	t.Run("missing capabilities generate", func(t *testing.T) {
 	t.Run("missing capabilities generate", func(t *testing.T) {
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model: "bert",
 			Model: "bert",
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, ggml.KV{
 				"general.architecture": "bert",
 				"general.architecture": "bert",
 				"bert.pooling_type":    uint32(0),
 				"bert.pooling_type":    uint32(0),
-			}, []llm.Tensor{})),
+			}, []ggml.Tensor{})),
 			Stream: &stream,
 			Stream: &stream,
 		})
 		})
 
 

+ 3 - 3
server/routes_test.go

@@ -21,7 +21,7 @@ import (
 	"unicode"
 	"unicode"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/types/model"
@@ -612,8 +612,8 @@ func TestShow(t *testing.T) {
 		Name: "show-model",
 		Name: "show-model",
 		Modelfile: fmt.Sprintf(
 		Modelfile: fmt.Sprintf(
 			"FROM %s\nFROM %s",
 			"FROM %s\nFROM %s",
-			createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
-			createBinFile(t, llm.KV{"general.type": "projector", "general.architecture": "clip"}, nil),
+			createBinFile(t, ggml.KV{"general.architecture": "test"}, nil),
+			createBinFile(t, ggml.KV{"general.type": "projector", "general.architecture": "clip"}, nil),
 		),
 		),
 	})
 	})
 
 

+ 13 - 12
server/sched.go

@@ -18,6 +18,7 @@ import (
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/llm"
 )
 )
 
 
@@ -41,8 +42,8 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex
 	loadedMu sync.Mutex
 
 
-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
-	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	loadFn       func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn  func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
 	getGpuFn     func() discover.GpuInfoList
 	getGpuFn     func() discover.GpuInfoList
 	getCpuFn     func() discover.GpuInfoList
 	getCpuFn     func() discover.GpuInfoList
 	reschedDelay time.Duration
 	reschedDelay time.Duration
@@ -409,7 +410,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 	}()
 }
 }
 
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 	if numParallel < 1 {
 		numParallel = 1
 		numParallel = 1
 	}
 	}
@@ -417,12 +418,12 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoL
 	if req.sessionDuration != nil {
 	if req.sessionDuration != nil {
 		sessionDuration = req.sessionDuration.Duration
 		sessionDuration = req.sessionDuration.Duration
 	}
 	}
-	llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
+	llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 	if err != nil {
 	if err != nil {
 		// some older models are not compatible with newer versions of llama.cpp
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
 		// show a generalized compatibility error until there is a better way to
 		// check for model compatibility
 		// check for model compatibility
-		if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
+		if errors.Is(err, ggml.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
 			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
 			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
 		}
 		}
 		slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
 		slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
@@ -685,7 +686,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model cannot be fit fully within the available GPU(s), nil is returned
 // If the model cannot be fit fully within the available GPU(s), nil is returned
 // If numParallel is <= 0, this will try to optimize parallelism based on available VRAM, and adjust
 // If numParallel is <= 0, this will try to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64
 	var estimatedVRAM uint64
 
 
 	var numParallelToTry []int
 	var numParallelToTry []int
@@ -710,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 			req.opts.NumCtx = req.origNumCtx * p
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
 						*numParallel = p
 						return []discover.GpuInfo{g}
 						return []discover.GpuInfo{g}
@@ -726,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 		// Now try all the GPUs
 		// Now try all the GPUs
 		for _, p := range numParallelToTry {
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
 			req.opts.NumCtx = req.origNumCtx * p
-			if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+			if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
 				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
 				*numParallel = p
 				*numParallel = p
 				return sgl
 				return sgl
@@ -737,7 +738,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 }
 }
 
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
 		req.opts.NumCtx = req.origNumCtx
@@ -749,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.
 	var bestEstimate uint64
 	var bestEstimate uint64
 	var bestFit int
 	var bestFit int
 	for i, gl := range byLibrary {
 	for i, gl := range byLibrary {
-		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+		_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
 		if estimatedVRAM > bestEstimate {
 		if estimatedVRAM > bestEstimate {
 			bestEstimate = estimatedVRAM
 			bestEstimate = estimatedVRAM
 			bestFit = i
 			bestFit = i
@@ -822,9 +823,9 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
-	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+	estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {
 	if estimate.TotalSize <= gpus[0].FreeMemory {
 		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
 		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
 		return nil
 		return nil
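
The selection order described above, a single GPU first and then the whole group, can be expressed as a small helper. This is a sketch rather than the scheduler's code; the package and function names are hypothetical.

package sched

import (
	"fmt"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/llm"
)

// pickSingleGPUFirst mirrors the strategy documented above: prefer a single
// GPU that can hold the whole model before spreading it across the group.
func pickSingleGPUFirst(f *ggml.GGML, gpus discover.GpuInfoList, opts api.Options) discover.GpuInfoList {
	for _, g := range gpus {
		if ok, vram := llm.PredictServerFit([]discover.GpuInfo{g}, f, nil, nil, opts); ok {
			fmt.Println("fits on single GPU", g.ID, "requires", format.HumanBytes2(vram))
			return []discover.GpuInfo{g}
		}
	}

	if ok, vram := llm.PredictServerFit(gpus, f, nil, nil, opts); ok {
		fmt.Println("fits when spread across GPUs, requires", format.HumanBytes2(vram))
		return gpus
	}

	return nil // partial offload or CPU fallback
}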

+ 19 - 18
server/sched_test.go

@@ -15,6 +15,7 @@ import (
 	"github.com/ollama/ollama/app/lifecycle"
 	"github.com/ollama/ollama/app/lifecycle"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/fs/ggml"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/llm"
 )
 )
 
 
@@ -37,7 +38,7 @@ func TestLoad(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
 	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
 	defer done()
 	defer done()
 	s := InitScheduler(ctx)
 	s := InitScheduler(ctx)
-	var ggml *llm.GGML // value not used in tests
+	var f *ggml.GGML // value not used in tests
 	req := &LlmRequest{
 	req := &LlmRequest{
 		ctx:             ctx,
 		ctx:             ctx,
 		model:           &Model{ModelPath: "foo"},
 		model:           &Model{ModelPath: "foo"},
@@ -47,11 +48,11 @@ func TestLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	}
 	// Fail to load model first
 	// Fail to load model first
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 		return nil, errors.New("something failed to load model blah")
 	}
 	}
 	gpus := discover.GpuInfoList{}
 	gpus := discover.GpuInfoList{}
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 	require.Empty(t, req.successCh)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
 	require.Len(t, req.errCh, 1)
 	s.loadedMu.Lock()
 	s.loadedMu.Lock()
@@ -61,10 +62,10 @@ func TestLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
 
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 		return server, nil
 	}
 	}
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 	select {
 	select {
 	case err := <-req.errCh:
 	case err := <-req.errCh:
 		require.NoError(t, err)
 		require.NoError(t, err)
@@ -78,7 +79,7 @@ func TestLoad(t *testing.T) {
 
 
 	req.model.ModelPath = "dummy_model_path"
 	req.model.ModelPath = "dummy_model_path"
 	server.waitResp = errors.New("wait failure")
 	server.waitResp = errors.New("wait failure")
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)
 	select {
 	select {
 	case err := <-req.errCh:
 	case err := <-req.errCh:
 		require.Contains(t, err.Error(), "wait failure")
 		require.Contains(t, err.Error(), "wait failure")
@@ -99,10 +100,10 @@ type reqBundle struct {
 	ctxDone func()
 	ctxDone func()
 	srv     *mockLlm
 	srv     *mockLlm
 	req     *LlmRequest
 	req     *LlmRequest
-	ggml    *llm.GGML
+	f       *ggml.GGML
 }
 }
 
 
-func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	return scenario.srv, nil
 	return scenario.srv, nil
 }
 }
 
 
@@ -115,7 +116,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	require.NoError(t, err)
 	require.NoError(t, err)
 	defer f.Close()
 	defer f.Close()
 
 
-	require.NoError(t, llm.WriteGGUF(f, llm.KV{
+	require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
 		"general.architecture":          "llama",
 		"general.architecture":          "llama",
 		"llama.context_length":          uint32(32),
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
 		"llama.embedding_length":        uint32(4096),
@@ -125,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		"tokenizer.ggml.tokens":         []string{" "},
 		"tokenizer.ggml.tokens":         []string{" "},
 		"tokenizer.ggml.scores":         []float32{0},
 		"tokenizer.ggml.scores":         []float32{0},
 		"tokenizer.ggml.token_type":     []int32{0},
 		"tokenizer.ggml.token_type":     []int32{0},
-	}, []llm.Tensor{
+	}, []ggml.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}))
@@ -133,7 +134,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est

 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
-	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
+	b.f, err = llm.LoadModel(model.ModelPath, 0)
 	require.NoError(t, err)

 	if duration == nil {
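Note on the fixture above: newScenarioRequest writes a tiny GGUF into a temp file with ggml.WriteGGUF and immediately decodes it back through llm.LoadModel, keeping the result in the bundle's f field. Below is a minimal standalone sketch of that write-then-decode round trip outside the test harness; the helper name, file path, and reduced KV set are illustrative assumptions, while the types and calls (ggml.KV, ggml.Tensor, ggml.WriteGGUF, llm.LoadModel) are the ones this diff switches to.

// Sketch only: same round trip as the fixture, as a standalone program.
package main

import (
	"bytes"
	"log"
	"os"

	"github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/llm"
)

// writeMinimalGGUF writes a GGUF with a handful of metadata keys and one
// dummy tensor, enough for the decoder to have something to read back.
func writeMinimalGGUF(path string) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()

	kv := ggml.KV{
		"general.architecture":   "llama",
		"llama.context_length":   uint32(32),
		"llama.embedding_length": uint32(4096),
	}
	tensors := []ggml.Tensor{
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	}
	return ggml.WriteGGUF(f, kv, tensors)
}

func main() {
	if err := writeMinimalGGUF("dummy.gguf"); err != nil {
		log.Fatal(err)
	}
	// Decode it back; the second argument mirrors the test's LoadModel call.
	f, err := llm.LoadModel("dummy.gguf", 0)
	if err != nil {
		log.Fatal(err)
	}
	_ = f // *ggml.GGML: the decoded metadata plus tensor listing
}

The split introduced by this change is visible here: the GGUF metadata and tensor types come from fs/ggml, while llm keeps the LoadModel entry point.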
@@ -174,7 +175,7 @@ func TestRequestsSameModelSameRequest(t *testing.T) {
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
 	b.req.model = a.req.model
-	b.ggml = a.ggml
+	b.f = a.f

 	s.newServerFn = a.newServer
 	slog.Info("a")
@@ -218,7 +219,7 @@ func TestRequestsSimpleReloadSameModel(t *testing.T) {
 	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
 	tmpModel := *a.req.model
 	b.req.model = &tmpModel
-	b.ggml = a.ggml
+	b.f = a.f

 	s.newServerFn = a.newServer
 	slog.Info("a")
@@ -419,13 +420,13 @@ func TestExpireRunner(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
 	}

-	var ggml *llm.GGML
+	var f *ggml.GGML
 	gpus := discover.GpuInfoList{}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
-	s.load(req, ggml, gpus, 0)
+	s.load(req, f, gpus, 0)

 	select {
 	case err := <-req.errCh:
@@ -729,9 +730,9 @@ func TestHomogeneousGPUs(t *testing.T) {
 	}
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		require.Len(t, gpus, 1)
-		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
+		return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
 	}
 	slog.Info("a")
 	s.pendingReqCh <- a.req
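The injectable constructor changes shape the same way everywhere in this file: the decoded model parameter is now *ggml.GGML. Here is a sketch of a reusable stub with the new signature; the package name, helper name, and the discover import path are assumptions for illustration, but the signature itself is the one shown in the hunks above.

// Sketch with assumptions: a test helper returning a constructor that
// matches the updated newServerFn signature.
package schedtest // hypothetical package name

import (
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/llm"
)

// constantServerFn ignores the decoded model and GPU list and always returns
// the supplied stub, mirroring what the mocks in this test file do.
func constantServerFn(stub llm.LlamaServer) func(discover.GpuInfoList, string, *ggml.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
	return func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return stub, nil
	}
}

Wired up as s.newServerFn = constantServerFn(server), it compiles against the new signature without touching the rest of the test.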

+ 2 - 2
template/template_test.go

@@ -14,7 +14,7 @@ import (
 	"github.com/google/go-cmp/cmp"

 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fs/ggml"
 )

 func TestNamed(t *testing.T) {
@@ -33,7 +33,7 @@ func TestNamed(t *testing.T) {

 		for k, v := range ss {
 			t.Run(k, func(t *testing.T) {
-				kv := llm.KV{"tokenizer.chat_template": v}
+				kv := ggml.KV{"tokenizer.chat_template": v}
 				s := kv.ChatTemplate()
 				r, err := Named(s)
 				if err != nil {
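For context, ggml.KV is the decoded GGUF metadata map and ChatTemplate() is the accessor the template tests now reach through fs/ggml. A minimal sketch of that lookup outside the test; the template string literal is a placeholder, not one of the named templates.

// Sketch only: build a ggml.KV by hand and read the chat template back,
// the same lookup the test above exercises.
package main

import (
	"fmt"

	"github.com/ollama/ollama/fs/ggml"
)

func main() {
	kv := ggml.KV{"tokenizer.chat_template": "{{ .Prompt }}"}
	fmt.Println(kv.ChatTemplate()) // prints the raw template string back
}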