
Move Go code out of llm package

Daniel Hiltgen 6 months ago
parent
commit
4e988ad5d6

+ 16 - 16
convert/convert.go

@@ -9,7 +9,7 @@ import (
 	"log/slog"
 	"strings"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 type ModelParameters struct {
@@ -27,8 +27,8 @@ type AdapterParameters struct {
 	} `json:"lora_parameters"`
 }
 
-func (ModelParameters) KV(t *Tokenizer) llm.KV {
-	kv := llm.KV{
+func (ModelParameters) KV(t *Tokenizer) fileutils.KV {
+	kv := fileutils.KV{
 		"general.file_type":            uint32(1),
 		"general.quantization_version": uint32(2),
 		"tokenizer.ggml.pre":           t.Pre,
@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p AdapterParameters) KV() llm.KV {
+func (p AdapterParameters) KV() fileutils.KV {
 	var alpha float32
 	if p.LoraParameters.Alpha == 0 {
 		alpha = float32(p.Alpha)
@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV {
 		alpha = p.LoraParameters.Alpha
 	}
 
-	kv := llm.KV{
+	kv := fileutils.KV{
 		"adapter.lora.alpha": alpha,
 		"adapter.type":       "lora",
 		"general.file_type":  uint32(1),
@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
 	}
 }
 
-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
+func (ModelParameters) writeFile(ws io.WriteSeeker, kv fileutils.KV, ts []fileutils.Tensor) error {
+	return fileutils.WriteGGUF(ws, kv, ts)
 }
 
-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
-	return llm.WriteGGUF(ws, kv, ts)
+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv fileutils.KV, ts []fileutils.Tensor) error {
+	return fileutils.WriteGGUF(ws, kv, ts)
 }
 
 type ModelConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(*Tokenizer) llm.KV
+	KV(*Tokenizer) fileutils.KV
 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
+	Tensors([]Tensor) []fileutils.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
@@ -99,7 +99,7 @@ type ModelConverter interface {
 	// specialTokenTypes returns any special token types the model uses
 	specialTokenTypes() []string
 	// writeFile writes the model to the provided io.WriteSeeker
-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+	writeFile(io.WriteSeeker, fileutils.KV, []fileutils.Tensor) error
 }
 
 type moreParser interface {
@@ -108,17 +108,17 @@ type moreParser interface {
 
 type AdapterConverter interface {
 	// KV maps parameters to LLM key-values
-	KV(llm.KV) llm.KV
+	KV(fileutils.KV) fileutils.KV
 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
-	Tensors([]Tensor) []llm.Tensor
+	Tensors([]Tensor) []fileutils.Tensor
 	// Replacements returns a list of string pairs to replace in tensor names.
 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
 	Replacements() []string
 
-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
+	writeFile(io.WriteSeeker, fileutils.KV, []fileutils.Tensor) error
 }
 
-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV fileutils.KV) error {
 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
 	if err != nil {
 		return err
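The converter interfaces keep the same shape after the move; only the types now come from fileutils. Below is a minimal sketch of the relocated write path, assuming nothing beyond the fileutils.KV, fileutils.Tensor, and fileutils.WriteGGUF identifiers shown in this diff; the temp file and values are illustrative.

```go
package main

import (
	"bytes"
	"os"

	"github.com/ollama/ollama/fileutils"
)

func main() {
	// Write a tiny GGUF file the way the converters now do: build
	// fileutils.KV / fileutils.Tensor values, then hand them to WriteGGUF.
	f, err := os.CreateTemp("", "example-*.gguf")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	kv := fileutils.KV{
		"general.architecture": "llama",
		"general.file_type":    uint32(1),
	}
	ts := []fileutils.Tensor{
		{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
	}
	if err := fileutils.WriteGGUF(f, kv, ts); err != nil {
		panic(err)
	}
}
```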

+ 5 - 5
convert/convert_bert.go

@@ -8,7 +8,7 @@ import (
 	"slices"
 	"strings"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 type bertModel struct {
@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 	return nil
 }
 
-func (p *bertModel) KV(t *Tokenizer) llm.KV {
+func (p *bertModel) KV(t *Tokenizer) fileutils.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "bert"
 	kv["bert.attention.causal"] = false
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *bertModel) Tensors(ts []Tensor) []fileutils.Tensor {
+	var out []fileutils.Tensor
 	for _, t := range ts {
 		if slices.Contains([]string{
 			"embeddings.position_ids",
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
 			continue
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 5 - 5
convert/convert_gemma.go

@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 type gemmaModel struct {
@@ -23,7 +23,7 @@ type gemmaModel struct {
 
 var _ ModelConverter = (*gemmaModel)(nil)
 
-func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
+func (p *gemmaModel) KV(t *Tokenizer) fileutils.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma"
 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *gemmaModel) Tensors(ts []Tensor) []fileutils.Tensor {
+	var out []fileutils.Tensor
 	for _, t := range ts {
 		if strings.HasSuffix(t.Name(), "_norm.weight") {
 			t.SetRepacker(p.addOne)
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 2 - 2
convert/convert_gemma2.go

@@ -1,7 +1,7 @@
 package convert
 
 import (
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 type gemma2Model struct {
@@ -11,7 +11,7 @@ type gemma2Model struct {
 	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
 }
 
-func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
+func (p *gemma2Model) KV(t *Tokenizer) fileutils.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "gemma2"
 	kv["gemma2.context_length"] = p.MaxPositionEmbeddings

+ 5 - 5
convert/convert_gemma2_adapter.go

@@ -6,7 +6,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 type gemma2Adapter struct {
@@ -15,14 +15,14 @@ type gemma2Adapter struct {
 
 var _ AdapterConverter = (*gemma2Adapter)(nil)
 
-func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
+func (p *gemma2Adapter) KV(baseKV fileutils.KV) fileutils.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "gemma2"
 	return kv
 }
 
-func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *gemma2Adapter) Tensors(ts []Tensor) []fileutils.Tensor {
+	var out []fileutils.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 6 - 6
convert/convert_llama.go

@@ -9,7 +9,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 type llamaModel struct {
@@ -46,7 +46,7 @@ type llamaModel struct {
 
 var _ ModelConverter = (*llamaModel)(nil)
 
-func (p *llamaModel) KV(t *Tokenizer) llm.KV {
+func (p *llamaModel) KV(t *Tokenizer) fileutils.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "llama"
 	kv["llama.vocab_size"] = p.VocabSize
@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaModel) Tensors(ts []Tensor) []fileutils.Tensor {
+	var out []fileutils.Tensor
 
 	if p.RopeScaling.factors != nil {
-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     "rope_freqs.weight",
 			Kind:     0,
 			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 5 - 5
convert/convert_llama_adapter.go

@@ -7,7 +7,7 @@ import (
 	"github.com/pdevine/tensor"
 	"github.com/pdevine/tensor/native"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 type llamaAdapter struct {
@@ -18,7 +18,7 @@ type llamaAdapter struct {
 
 var _ AdapterConverter = (*llamaAdapter)(nil)
 
-func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
+func (p *llamaAdapter) KV(baseKV fileutils.KV) fileutils.KV {
 	kv := p.AdapterParameters.KV()
 	kv["general.architecture"] = "llama"
 	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
 	return kv
 }
 
-func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
-	var out []llm.Tensor
+func (p *llamaAdapter) Tensors(ts []Tensor) []fileutils.Tensor {
+	var out []fileutils.Tensor
 	for _, t := range ts {
 		shape := t.Shape()
 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
 			t.SetRepacker(p.repack)
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    shape,

+ 5 - 5
convert/convert_mixtral.go

@@ -6,7 +6,7 @@ import (
 	"slices"
 	"strings"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 type mixtralModel struct {
@@ -15,7 +15,7 @@ type mixtralModel struct {
 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
 }
 
-func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
+func (p *mixtralModel) KV(t *Tokenizer) fileutils.KV {
 	kv := p.llamaModel.KV(t)
 
 	if p.NumLocalExperts > 0 {
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
+func (p *mixtralModel) Tensors(ts []Tensor) []fileutils.Tensor {
 	oldnew := []string{
 		"model.layers", "blk",
 		"w1", "ffn_gate_exps",
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 		return true
 	})
 
-	var out []llm.Tensor
+	var out []fileutils.Tensor
 	for n, e := range experts {
 		// TODO(mxyng): sanity check experts
-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     n,
 			Kind:     e[0].Kind(),
 			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),

+ 7 - 7
convert/convert_phi3.go

@@ -8,7 +8,7 @@ import (
 	"strings"
 	"sync"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 type phi3Model struct {
@@ -37,7 +37,7 @@ type phi3Model struct {
 
 var _ ModelConverter = (*phi3Model)(nil)
 
-func (p *phi3Model) KV(t *Tokenizer) llm.KV {
+func (p *phi3Model) KV(t *Tokenizer) fileutils.KV {
 	kv := p.ModelParameters.KV(t)
 	kv["general.architecture"] = "phi3"
 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
 	return kv
 }
 
-func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
+func (p *phi3Model) Tensors(ts []Tensor) []fileutils.Tensor {
 	var addRopeFactors sync.Once
 
-	out := make([]llm.Tensor, 0, len(ts)+2)
+	out := make([]fileutils.Tensor, 0, len(ts)+2)
 	for _, t := range ts {
 		if strings.HasPrefix(t.Name(), "blk.0.") {
 			addRopeFactors.Do(func() {
-				out = append(out, llm.Tensor{
+				out = append(out, fileutils.Tensor{
 					Name:     "rope_factors_long.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
 					WriterTo: p.RopeScaling.LongFactor,
-				}, llm.Tensor{
+				}, fileutils.Tensor{
 					Name:     "rope_factors_short.weight",
 					Kind:     0,
 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 			})
 		}
 
-		out = append(out, llm.Tensor{
+		out = append(out, fileutils.Tensor{
 			Name:     t.Name(),
 			Kind:     t.Kind(),
 			Shape:    t.Shape(),

+ 5 - 5
convert/convert_test.go

@@ -20,7 +20,7 @@ import (
 
 	"golang.org/x/exp/maps"
 
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 type tensorData struct {
@@ -29,7 +29,7 @@ type tensorData struct {
 	Shape   []int  `json:"shape"`
 }
 
-func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
+func convertFull(t *testing.T, fsys fs.FS) (*os.File, fileutils.KV, *fileutils.Tensors) {
 	t.Helper()
 
 	f, err := os.CreateTemp(t.TempDir(), "f16")
@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })
 
-	m, _, err := llm.DecodeGGML(r, math.MaxInt)
+	m, _, err := fileutils.DecodeGGML(r, math.MaxInt)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 	return r, m.KV(), m.Tensors()
 }
 
-func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
+func generateResultsJSON(t *testing.T, f *os.File, kv fileutils.KV, tensors *fileutils.Tensors) map[string]string {
 	actual := make(map[string]string)
 	for k, v := range kv {
 		if s, ok := v.(json.Marshaler); !ok {
@@ -330,7 +330,7 @@ func TestConvertAdapter(t *testing.T) {
 			}
 			defer r.Close()
 
-			m, _, err := llm.DecodeGGML(r, math.MaxInt)
+			m, _, err := fileutils.DecodeGGML(r, math.MaxInt)
 			if err != nil {
 				t.Fatal(err)
 			}

+ 3 - 0
discover/README.md

@@ -0,0 +1,3 @@
+# `discover`
+
+This package is responsible for discovering information about the system and its capabilities to run LLMs. This includes GPU and CPU discovery, so the optimal runner can be chosen for a given model. The Ollama scheduler relies on up-to-date available memory information, so this package provides the ability to refresh free memory as efficiently as possible.
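A hedged sketch of how a caller might consult this package, based only on the calls visible elsewhere in this commit (GetGPUInfo, GetCPUInfo, GpuInfoList and its Library field); the CPU fallback mirrors the check in runners/llama-server.go below, and the iteration assumes GpuInfoList is a slice of GPU entries.

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/discover"
)

func main() {
	// Ask discover what hardware is available; fall back to CPU-only
	// info when no usable GPU is reported.
	gpus := discover.GetGPUInfo()
	if len(gpus) == 0 || gpus[0].Library == "cpu" {
		gpus = discover.GetCPUInfo()
	}
	for _, g := range gpus {
		fmt.Println("library:", g.Library)
	}
}
```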

+ 3 - 0
fileutils/README.md

@@ -0,0 +1,3 @@
+# `fileutils`
+
+This package provides utilities for loading and inspecting model files.
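A hedged sketch of "loading and inspecting" a model file with this package, using only the DecodeGGML and KV calls that appear elsewhere in this commit; the file path is a placeholder.

```go
package main

import (
	"fmt"
	"math"
	"os"

	"github.com/ollama/ollama/fileutils"
)

func main() {
	f, err := os.Open("model.gguf") // placeholder path
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// DecodeGGML parses the metadata; math.MaxInt keeps full array values,
	// matching how convert_test.go uses it in this commit.
	ggml, _, err := fileutils.DecodeGGML(f, math.MaxInt)
	if err != nil {
		panic(err)
	}
	fmt.Println("architecture:", ggml.KV()["general.architecture"])
}
```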

+ 3 - 1
llm/filetype.go → fileutils/filetype.go

@@ -1,9 +1,11 @@
-package llm
+package fileutils
 
 import "fmt"
 
 type fileType uint32
 
+// TODO this should map over to the GGML CGO enum type
+
 const (
 	fileTypeF32 fileType = iota
 	fileTypeF16

+ 1 - 1
llm/ggla.go → fileutils/ggla.go

@@ -1,4 +1,4 @@
-package llm
+package fileutils
 
 import (
 	"encoding/binary"

+ 22 - 1
llm/ggml.go → fileutils/ggml.go

@@ -1,10 +1,11 @@
-package llm
+package fileutils
 
 import (
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+	"os"
 	"slices"
 	"strings"
 	"sync"
@@ -488,3 +489,23 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 
 	return
 }
+
+// LoadModel will load a model from disk. The model must be in the GGML format.
+//
+// It collects array values for arrays with a size less than or equal to
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
+func LoadModel(model string, maxArraySize int) (*GGML, error) {
+	if _, err := os.Stat(model); err != nil {
+		return nil, err
+	}
+
+	f, err := os.Open(model)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	ggml, _, err := DecodeGGML(f, maxArraySize)
+	return ggml, err
+}
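A hedged usage sketch for the relocated LoadModel; the maxArraySize semantics follow the doc comment above and the model path is a placeholder.

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/fileutils"
)

func main() {
	// maxArraySize: 0 uses the default of 1024, a negative value collects
	// every array value (e.g. full tokenizer vocabularies).
	ggml, err := fileutils.LoadModel("/path/to/model.gguf", -1) // placeholder path
	if err != nil {
		panic(err)
	}
	fmt.Println("architecture:", ggml.KV()["general.architecture"])
}
```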

+ 1 - 0
fileutils/ggml_test.go

@@ -0,0 +1 @@
+package fileutils

+ 1 - 1
llm/gguf.go → fileutils/gguf.go

@@ -1,4 +1,4 @@
-package llm
+package fileutils
 
 import (
 	"bytes"

+ 2 - 2
llm/memory.go → fileutils/memory.go

@@ -1,4 +1,4 @@
-package llm
+package fileutils
 
 import (
 	"fmt"
@@ -329,7 +329,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 	return estimate
 }
 
-func (m MemoryEstimate) log() {
+func (m MemoryEstimate) Log() {
 	overhead := envconfig.GpuOverhead()
 
 	log := slog.With()

+ 1 - 1
llm/memory_test.go → fileutils/memory_test.go

@@ -1,4 +1,4 @@
-package llm
+package fileutils
 
 import (
 	"bytes"

+ 0 - 1
llm/ggml_test.go

@@ -1 +0,0 @@
-package llm

+ 3 - 0
runners/README.md

@@ -0,0 +1,3 @@
+# `runners`
+
+Ollama uses a subprocess model, running one or more child processes to load the LLM. On some platforms (non-containerized Linux, macOS) these executables are carried as payloads inside the main executable via the ../build package. Extraction and discovery of these runners at runtime is implemented in this package. This package also provides the abstraction used to communicate with these subprocesses.
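A hedged sketch of the extraction and discovery flow described above, limited to the functions this commit already calls (Refresh, GetAvailableServers, ServerForCpu); nothing is assumed about the returned collection beyond what len() requires.

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/build"
	"github.com/ollama/ollama/runners"
)

func main() {
	// Extract (or re-extract) the embedded runner payloads to a directory.
	dir, err := runners.Refresh(build.EmbedFS)
	if err != nil {
		panic(err)
	}

	// Enumerate runners usable on this machine, plus the CPU fallback.
	available := runners.GetAvailableServers(dir)
	fmt.Println("available runners:", len(available))
	fmt.Println("cpu runner:", runners.ServerForCpu())
}
```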

+ 33 - 0
runners/common.go

@@ -2,6 +2,7 @@ package runners
 
 import (
 	"compress/gzip"
+	"context"
 	"errors"
 	"fmt"
 	"io"
@@ -15,9 +16,11 @@ import (
 	"strings"
 	"sync"
 	"syscall"
+	"time"
 
 	"golang.org/x/sync/errgroup"
 
+	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 )
@@ -31,6 +34,36 @@ var (
 	runnersDir = ""
 )
 
+type CompletionRequest struct {
+	Prompt  string
+	Format  string
+	Images  []ImageData
+	Options *api.Options
+}
+
+type CompletionResponse struct {
+	Content            string
+	DoneReason         string
+	Done               bool
+	PromptEvalCount    int
+	PromptEvalDuration time.Duration
+	EvalCount          int
+	EvalDuration       time.Duration
+}
+
+type LLMServer interface {
+	Ping(ctx context.Context) error
+	WaitUntilRunning(ctx context.Context) error
+	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
+	Embedding(ctx context.Context, input string) ([]float32, error)
+	Tokenize(ctx context.Context, content string) ([]int, error)
+	Detokenize(ctx context.Context, tokens []int) (string, error)
+	Close() error
+	EstimatedVRAM() uint64 // Total VRAM across all GPUs
+	EstimatedTotal() uint64
+	EstimatedVRAMByGPU(gpuID string) uint64
+}
+
 // Return the location where runners are stored
 // If runners are payloads, this will either extract them
 // or refresh them if any have disappeared due to tmp cleaners
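A hedged sketch of driving the new LLMServer interface: Completion streams partial results through the callback until Done is set. Only the types come from this commit; the zero api.Options value and the helper name are illustrative.

```go
package example

import (
	"context"
	"fmt"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/runners"
)

// generate streams a completion from any LLMServer implementation and
// prints tokens as they arrive; the callback fires until Done is set.
func generate(ctx context.Context, s runners.LLMServer, prompt string) error {
	if err := s.WaitUntilRunning(ctx); err != nil {
		return err
	}
	req := runners.CompletionRequest{
		Prompt:  prompt,
		Options: &api.Options{}, // zero options; real callers populate these
	}
	return s.Completion(ctx, req, func(r runners.CompletionResponse) {
		fmt.Print(r.Content)
		if r.Done {
			fmt.Printf("\n[%s: %d tokens in %s]\n", r.DoneReason, r.EvalCount, r.EvalDuration)
		}
	})
}
```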

+ 14 - 64
llm/server.go → runners/llama-server.go

@@ -1,4 +1,4 @@
-package llm
+package runners
 
 import (
 	"bufio"
@@ -28,24 +28,11 @@ import (
 	"github.com/ollama/ollama/build"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/runners"
 )
 
-type LlamaServer interface {
-	Ping(ctx context.Context) error
-	WaitUntilRunning(ctx context.Context) error
-	Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error
-	Embedding(ctx context.Context, input string) ([]float32, error)
-	Tokenize(ctx context.Context, content string) ([]int, error)
-	Detokenize(ctx context.Context, tokens []int) (string, error)
-	Close() error
-	EstimatedVRAM() uint64 // Total VRAM across all GPUs
-	EstimatedTotal() uint64
-	EstimatedVRAMByGPU(gpuID string) uint64
-}
-
 // llmServer is an instance of the llama.cpp server
 type llmServer struct {
 	port        int
@@ -58,7 +45,7 @@ type llmServer struct {
 	modelLock   sync.Mutex   // Temporary until we switch fully to Go server
 	model       *llama.Model // If non-nil, the runner is a new Go server
 
-	estimate    MemoryEstimate
+	estimate    fileutils.MemoryEstimate
 	totalLayers uint64
 	// gpuCount     int
 	gpus         discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
@@ -68,32 +55,12 @@ type llmServer struct {
 	sem *semaphore.Weighted
 }
 
-// LoadModel will load a model from disk. The model must be in the GGML format.
-//
-// It collects array values for arrays with a size less than or equal to
-// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
-// the maxArraySize is negative, all arrays are collected.
-func LoadModel(model string, maxArraySize int) (*GGML, error) {
-	if _, err := os.Stat(model); err != nil {
-		return nil, err
-	}
-
-	f, err := os.Open(model)
-	if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	ggml, _, err := DecodeGGML(f, maxArraySize)
-	return ggml, err
-}
-
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters, projectors []string, opts api.Options, numParallel int) (LLMServer, error) {
 	var err error
 	var cpuRunner string
-	var estimate MemoryEstimate
+	var estimate fileutils.MemoryEstimate
 	var systemTotalMemory uint64
 	var systemFreeMemory uint64
 	var systemSwapFreeMemory uint64
@@ -109,10 +76,10 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		gpus = discover.GetCPUInfo()
 	}
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		cpuRunner = runners.ServerForCpu()
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+		cpuRunner = ServerForCpu()
+		estimate = fileutils.EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = fileutils.EstimateGPULayers(gpus, ggml, projectors, opts)
 
 		switch {
 		case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
@@ -121,7 +88,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			opts.NumGPU = 0
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
-			cpuRunner = runners.ServerForCpu()
+			cpuRunner = ServerForCpu()
 			gpus = discover.GetCPUInfo()
 		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 			opts.NumGPU = estimate.Layers
@@ -139,7 +106,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		}
 	}
 
-	estimate.log()
+	estimate.Log()
 
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
@@ -148,12 +115,12 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}
 
-	rDir, err := runners.Refresh(build.EmbedFS)
+	rDir, err := Refresh(build.EmbedFS)
 	if err != nil {
 		return nil, err
 	}
 
-	availableServers := runners.GetAvailableServers(rDir)
+	availableServers := GetAvailableServers(rDir)
 	if len(availableServers) == 0 {
 		return nil, finalErr
 	}
@@ -161,7 +128,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	if cpuRunner != "" {
 		servers = []string{cpuRunner}
 	} else {
-		servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
+		servers = ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 	}
 	demandLib := envconfig.LLMLibrary()
 	if demandLib != "" {
@@ -325,7 +292,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		_, err := os.Stat(server)
 		if errors.Is(err, os.ErrNotExist) {
 			slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
-			_, err = runners.Refresh(build.EmbedFS)
+			_, err = Refresh(build.EmbedFS)
 			if err != nil {
 				slog.Warn("failed to reinitialize payloads", "error", err)
 				return nil, err
@@ -673,23 +640,6 @@ type completion struct {
 	}
 }
 
-type CompletionRequest struct {
-	Prompt  string
-	Format  string
-	Images  []ImageData
-	Options *api.Options
-}
-
-type CompletionResponse struct {
-	Content            string
-	DoneReason         string
-	Done               bool
-	PromptEvalCount    int
-	PromptEvalDuration time.Duration
-	EvalCount          int
-	EvalDuration       time.Duration
-}
-
 func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
 	if err := s.sem.Acquire(ctx, 1); err != nil {
 		slog.Error("Failed to acquire semaphore", "error", err)
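A hedged end-to-end sketch tying the relocated packages together, following the NewLlamaServer signature above: fileutils parses the model, discover reports the hardware, and runners launches a server. The model path is a placeholder and api.DefaultOptions is assumed to exist for brevity.

```go
package example

import (
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/fileutils"
	"github.com/ollama/ollama/runners"
)

// startServer loads the GGML metadata, checks the hardware, and asks the
// runners package for a server bound to that model.
func startServer(modelPath string) (runners.LLMServer, error) {
	ggml, err := fileutils.LoadModel(modelPath, 0) // 0 = default array limit
	if err != nil {
		return nil, err
	}

	gpus := discover.GetGPUInfo()
	opts := api.DefaultOptions()

	// No LoRA adapters or projectors, a single parallel request.
	return runners.NewLlamaServer(gpus, modelPath, ggml, nil, nil, opts, 1)
}
```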

+ 1 - 1
llm/status.go → runners/llama-status.go

@@ -1,4 +1,4 @@
-package llm
+package runners
 
 import (
 	"bytes"

+ 1 - 1
llm/llm_darwin.go → runners/llama_darwin.go

@@ -1,4 +1,4 @@
-package llm
+package runners
 
 import (
 	"syscall"

+ 1 - 1
llm/llm_linux.go → runners/llama_linux.go

@@ -1,4 +1,4 @@
-package llm
+package runners
 
 import (
 	"syscall"

+ 1 - 1
llm/llm_windows.go → runners/llama_windows.go

@@ -1,4 +1,4 @@
-package llm
+package runners
 
 import (
 	"syscall"

+ 4 - 4
server/images.go

@@ -25,9 +25,9 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/auth"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
@@ -91,7 +91,7 @@ func (m *Model) CheckCapabilities(caps ...Capability) error {
 			defer f.Close()
 
 			// TODO(mxyng): decode the GGML into model to avoid doing this multiple times
-			ggml, _, err := llm.DecodeGGML(f, 0)
+			ggml, _, err := fileutils.DecodeGGML(f, 0)
 			if err != nil {
 				slog.Error("couldn't decode ggml", "error", err)
 				continue
@@ -431,7 +431,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 					baseLayer.MediaType == "application/vnd.ollama.image.model" &&
 					baseLayer.GGML != nil &&
 					baseLayer.GGML.Name() == "gguf" {
-					want, err := llm.ParseFileType(quantization)
+					want, err := fileutils.ParseFileType(quantization)
 					if err != nil {
 						return err
 					}
@@ -467,7 +467,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 							return err
 						}
 
-						ggml, _, err := llm.DecodeGGML(temp, 0)
+						ggml, _, err := fileutils.DecodeGGML(temp, 0)
 						if err != nil {
 							return err
 						}

+ 7 - 7
server/model.go

@@ -18,7 +18,7 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/convert"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/model"
 )
@@ -27,7 +27,7 @@ var intermediateBlobs map[string]string = make(map[string]string)
 
 type layerGGML struct {
 	Layer
-	*llm.GGML
+	*fileutils.GGML
 }
 
 func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
@@ -67,7 +67,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 			}
 			defer blob.Close()
 
-			ggml, _, err := llm.DecodeGGML(blob, 0)
+			ggml, _, err := fileutils.DecodeGGML(blob, 0)
 			if err != nil {
 				return nil, err
 			}
@@ -112,7 +112,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
 
 	switch command {
 	case "adapter":
-		var baseModel *llm.GGML
+		var baseModel *fileutils.GGML
 		for _, l := range baseLayers {
 			if l.GGML != nil {
 				baseModel = l.GGML
@@ -150,7 +150,7 @@ func parseFromZipFile(_ context.Context, command string, baseLayers []*layerGGML
 	}
 	defer bin.Close()
 
-	ggml, _, err := llm.DecodeGGML(bin, 0)
+	ggml, _, err := fileutils.DecodeGGML(bin, 0)
 	if err != nil {
 		return nil, err
 	}
@@ -184,7 +184,7 @@ func parseFromFile(ctx context.Context, command string, baseLayers []*layerGGML,
 
 	var offset int64
 	for offset < stat.Size() {
-		ggml, n, err := llm.DecodeGGML(file, 0)
+		ggml, n, err := fileutils.DecodeGGML(file, 0)
 		if errors.Is(err, io.EOF) {
 			break
 		} else if err != nil {
@@ -263,7 +263,7 @@ func detectContentType(r io.Reader) (string, error) {
 		return "", err
 	}
 
-	if contentType := llm.DetectGGMLType(b.Bytes()); contentType != "" {
+	if contentType := fileutils.DetectGGMLType(b.Bytes()); contentType != "" {
 		return contentType, nil
 	}
 

+ 3 - 3
server/model_test.go

@@ -13,7 +13,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/template"
 )
 
@@ -147,7 +147,7 @@ func TestParseFromFileFromLayer(t *testing.T) {
 		t.Fatalf("failed to open file: %v", err)
 	}
 	defer file.Close()
-	if err := llm.WriteGGUF(file, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
+	if err := fileutils.WriteGGUF(file, fileutils.KV{"general.architecture": "gemma"}, []fileutils.Tensor{}); err != nil {
		t.Fatalf("failed to write gguf: %v", err)
 	}
 
@@ -200,7 +200,7 @@ func TestParseLayerFromCopy(t *testing.T) {
 	defer file2.Close()
 
 	for range 5 {
-		if err := llm.WriteGGUF(file2, llm.KV{"general.architecture": "gemma"}, []llm.Tensor{}); err != nil {
+		if err := fileutils.WriteGGUF(file2, fileutils.KV{"general.architecture": "gemma"}, []fileutils.Tensor{}); err != nil {
 			t.Fatalf("failed to write gguf: %v", err)
 		}
 	}

+ 4 - 4
server/prompt.go

@@ -10,7 +10,7 @@ import (
 	"strings"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/runners"
 	"github.com/ollama/ollama/server/imageproc"
 	"github.com/ollama/ollama/template"
 )
@@ -22,7 +22,7 @@ var errTooManyImages = errors.New("vision model only supports a single image per
 // chatPrompt accepts a list of messages and returns the prompt and images that should be used for the next chat turn.
 // chatPrompt truncates any messages that exceed the context window of the model, making sure to always include 1) the
 // latest message and 2) system messages
-func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []llm.ImageData, _ error) {
+func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.Options, msgs []api.Message, tools []api.Tool) (prompt string, images []runners.ImageData, _ error) {
 	var system []api.Message
 
 	isMllama := checkMllamaModelFamily(m)
@@ -90,7 +90,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 					return "", nil, err
 				}
 
-				imgData := llm.ImageData{
+				imgData := runners.ImageData{
 					Data:          buf.Bytes(),
 					AspectRatioID: aspectRatioID,
 				}
@@ -105,7 +105,7 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
 			prefix := ""
 			prompt := msg.Content
 			for _, i := range msg.Images {
-				imgData := llm.ImageData{
+				imgData := runners.ImageData{
 					ID:   len(images),
 					Data: i,
 				}

+ 12 - 12
server/routes.go

@@ -29,7 +29,7 @@ import (
 	"github.com/ollama/ollama/build"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/runners"
@@ -78,7 +78,7 @@ func modelOptions(model *Model, requestOpts map[string]interface{}) (api.Options
 
 // scheduleRunner schedules a runner after validating inputs such as capabilities and model options.
 // It returns the allocated runner, model instance, and consolidated options if successful and error otherwise.
-func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (llm.LlamaServer, *Model, *api.Options, error) {
+func (s *Server) scheduleRunner(ctx context.Context, name string, caps []Capability, requestOpts map[string]any, keepAlive *api.Duration) (runners.LLMServer, *Model, *api.Options, error) {
 	if name == "" {
 		return nil, nil, nil, fmt.Errorf("model %w", errRequired)
 	}
@@ -187,9 +187,9 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		return
 	}
 
-	images := make([]llm.ImageData, len(req.Images))
+	images := make([]runners.ImageData, len(req.Images))
 	for i := range req.Images {
-		images[i] = llm.ImageData{ID: i, Data: req.Images[i]}
+		images[i] = runners.ImageData{ID: i, Data: req.Images[i]}
 	}
 
 	prompt := req.Prompt
@@ -255,12 +255,12 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 		// TODO (jmorganca): avoid building the response twice both here and below
 		var sb strings.Builder
 		defer close(ch)
-		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
+		if err := r.Completion(c.Request.Context(), runners.CompletionRequest{
 			Prompt:  prompt,
 			Images:  images,
 			Format:  req.Format,
 			Options: opts,
-		}, func(cr llm.CompletionResponse) {
+		}, func(cr runners.CompletionResponse) {
 			res := api.GenerateResponse{
 				Model:      req.Model,
 				CreatedAt:  time.Now().UTC(),
@@ -639,7 +639,7 @@ func (s *Server) CreateHandler(c *gin.Context) {
 	}
 
 	if r.Path == "" && r.Modelfile == "" {
-		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "path or modelfile are required"})
+		c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": "path or fileutils are required"})
 		return
 	}
 
@@ -647,7 +647,7 @@ func (s *Server) CreateHandler(c *gin.Context) {
 	if r.Path != "" && r.Modelfile == "" {
 		f, err := os.Open(r.Path)
 		if err != nil {
-			c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading modelfile: %s", err)})
+			c.AbortWithStatusJSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("error reading fileutils: %s", err)})
 			return
 		}
 		defer f.Close()
@@ -851,12 +851,12 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 	return resp, nil
 }
 
-func getKVData(digest string, verbose bool) (llm.KV, error) {
+func getKVData(digest string, verbose bool) (fileutils.KV, error) {
 	maxArraySize := 0
 	if verbose {
 		maxArraySize = -1
 	}
-	kvData, err := llm.LoadModel(digest, maxArraySize)
+	kvData, err := fileutils.LoadModel(digest, maxArraySize)
 	if err != nil {
 		return nil, err
 	}
@@ -1436,12 +1436,12 @@ func (s *Server) ChatHandler(c *gin.Context) {
 	ch := make(chan any)
 	go func() {
 		defer close(ch)
-		if err := r.Completion(c.Request.Context(), llm.CompletionRequest{
+		if err := r.Completion(c.Request.Context(), runners.CompletionRequest{
 			Prompt:  prompt,
 			Images:  images,
 			Format:  req.Format,
 			Options: opts,
-		}, func(r llm.CompletionResponse) {
+		}, func(r runners.CompletionResponse) {
 			res := api.ChatResponse{
 				Model:      req.Model,
 				CreatedAt:  time.Now().UTC(),

+ 4 - 4
server/routes_create_test.go

@@ -16,12 +16,12 @@ import (
 	"github.com/gin-gonic/gin"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 
 var stream bool = false
 
-func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
+func createBinFile(t *testing.T, kv map[string]any, ti []fileutils.Tensor) string {
 	t.Helper()
 
 	f, err := os.CreateTemp(t.TempDir(), "")
@@ -30,7 +30,7 @@ func createBinFile(t *testing.T, kv map[string]any, ti []llm.Tensor) string {
 	}
 	defer f.Close()
 
-	if err := llm.WriteGGUF(f, kv, ti); err != nil {
+	if err := fileutils.WriteGGUF(f, kv, ti); err != nil {
 		t.Fatal(err)
 	}
 
@@ -581,7 +581,7 @@ func TestCreateDetectTemplate(t *testing.T) {
 	t.Run("matched", func(t *testing.T) {
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Name: "test",
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, fileutils.KV{
 				"tokenizer.chat_template": "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
 			}, nil)),
 			Stream: &stream,

+ 20 - 19
server/routes_generate_test.go

@@ -16,18 +16,19 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
+	"github.com/ollama/ollama/runners"
 )
 
 type mockRunner struct {
-	llm.LlamaServer
+	runners.LLMServer
 
 	// CompletionRequest is only valid until the next call to Completion
-	llm.CompletionRequest
-	llm.CompletionResponse
+	runners.CompletionRequest
+	runners.CompletionResponse
 }
 
-func (m *mockRunner) Completion(_ context.Context, r llm.CompletionRequest, fn func(r llm.CompletionResponse)) error {
+func (m *mockRunner) Completion(_ context.Context, r runners.CompletionRequest, fn func(r runners.CompletionResponse)) error {
 	m.CompletionRequest = r
 	fn(m.CompletionResponse)
 	return nil
@@ -41,8 +42,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }
 
-func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *fileutils.GGML, []string, []string, api.Options, int) (runners.LLMServer, error) {
+	return func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, projectors, system []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 		return mock, nil
 	}
 }
@@ -51,7 +52,7 @@ func TestGenerateChat(t *testing.T) {
 	gin.SetMode(gin.TestMode)
 
 	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
+		CompletionResponse: runners.CompletionResponse{
 			Done:               true,
 			DoneReason:         "stop",
 			PromptEvalCount:    1,
@@ -72,7 +73,7 @@ func TestGenerateChat(t *testing.T) {
 			getGpuFn:      discover.GetGPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -91,7 +92,7 @@ func TestGenerateChat(t *testing.T) {
 {{- if .System }}System: {{ .System }} {{ end }}
 {{- if .Prompt }}User: {{ .Prompt }} {{ end }}
 {{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
-`, createBinFile(t, llm.KV{
+`, createBinFile(t, fileutils.KV{
 			"general.architecture":          "llama",
 			"llama.block_count":             uint32(1),
 			"llama.context_length":          uint32(8192),
@@ -101,7 +102,7 @@ func TestGenerateChat(t *testing.T) {
 			"tokenizer.ggml.tokens":         []string{""},
 			"tokenizer.ggml.scores":         []float32{0},
 			"tokenizer.ggml.token_type":     []int32{0},
-		}, []llm.Tensor{
+		}, []fileutils.Tensor{
 			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -146,10 +147,10 @@ func TestGenerateChat(t *testing.T) {
 	t.Run("missing capabilities chat", func(t *testing.T) {
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model: "bert",
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
+			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, fileutils.KV{
 				"general.architecture": "bert",
 				"bert.pooling_type":    uint32(0),
-			}, []llm.Tensor{})),
+			}, []fileutils.Tensor{})),
 			Stream: &stream,
 		})
 
@@ -349,7 +350,7 @@ func TestGenerate(t *testing.T) {
 	gin.SetMode(gin.TestMode)
 
 	mock := mockRunner{
-		CompletionResponse: llm.CompletionResponse{
+		CompletionResponse: runners.CompletionResponse{
 			Done:               true,
 			DoneReason:         "stop",
 			PromptEvalCount:    1,
@@ -370,7 +371,7 @@ func TestGenerate(t *testing.T) {
 			getGpuFn:      discover.GetGPUInfo,
 			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -389,7 +390,7 @@ func TestGenerate(t *testing.T) {
 {{- if .System }}System: {{ .System }} {{ end }}
 {{- if .Prompt }}User: {{ .Prompt }} {{ end }}
 {{- if .Response }}Assistant: {{ .Response }} {{ end }}"""
-`, createBinFile(t, llm.KV{
+`, createBinFile(t, fileutils.KV{
 			"general.architecture":          "llama",
 			"llama.block_count":             uint32(1),
 			"llama.context_length":          uint32(8192),
@@ -399,7 +400,7 @@ func TestGenerate(t *testing.T) {
 			"tokenizer.ggml.tokens":         []string{""},
 			"tokenizer.ggml.scores":         []float32{0},
 			"tokenizer.ggml.token_type":     []int32{0},
-		}, []llm.Tensor{
+		}, []fileutils.Tensor{
 			{Name: "token_embd.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.attn_norm.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
 			{Name: "blk.0.ffn_down.weight", Shape: []uint64{1}, WriterTo: bytes.NewReader(make([]byte, 4))},
@@ -444,10 +445,10 @@ func TestGenerate(t *testing.T) {
 	t.Run("missing capabilities generate", func(t *testing.T) {
 		w := createRequest(t, s.CreateHandler, api.CreateRequest{
 			Model: "bert",
-			Modelfile: fmt.Sprintf("FROM %s", createBinFile(t, llm.KV{
 				"general.architecture": "bert",
 				"bert.pooling_type":    uint32(0),
-			}, []llm.Tensor{})),
+			}, []fileutils.Tensor{})),
			Stream: &stream,
 		})
 

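The generate tests above build throwaway GGUF fixtures from a fileutils.KV map plus a slice of fileutils.Tensor values via the createBinFile helper. Below is a minimal sketch of such a fixture writer, assuming only the fileutils.WriteGGUF signature visible in this diff; writeTestGGUF is a hypothetical name, not the helper these tests actually use.

```go
package server

import (
	"os"
	"testing"

	"github.com/ollama/ollama/fileutils"
)

// writeTestGGUF writes a tiny GGUF file from the given metadata and tensors
// and returns its path. Sketch only: it assumes the fileutils.WriteGGUF
// signature used throughout this diff.
func writeTestGGUF(t *testing.T, kv fileutils.KV, tensors []fileutils.Tensor) string {
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), "*.gguf")
	if err != nil {
		t.Fatal(err)
	}
	defer f.Close()

	// WriteGGUF serializes the key/value metadata plus the tensor entries
	// (each backed by its WriterTo payload) into the file.
	if err := fileutils.WriteGGUF(f, kv, tensors); err != nil {
		t.Fatal(err)
	}
	return f.Name()
}
```

A call such as writeTestGGUF(t, fileutils.KV{"general.architecture": "bert", "bert.pooling_type": uint32(0)}, nil) would mirror the "missing capabilities" fixtures in the hunks above.
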
+ 5 - 5
server/routes_test.go

@@ -16,7 +16,7 @@ import (
 	"testing"
 	"testing"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/types/model"
@@ -83,14 +83,14 @@ func Test_Routes(t *testing.T) {
 		fname := createTestFile(t, "ollama-model")
 		fname := createTestFile(t, "ollama-model")
 
 
 		r := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
 		r := strings.NewReader(fmt.Sprintf("FROM %s\nPARAMETER seed 42\nPARAMETER top_p 0.9\nPARAMETER stop foo\nPARAMETER stop bar", fname))
-		modelfile, err := parser.ParseFile(r)
+		fileutils, err := parser.ParseFile(r)
 		if err != nil {
 		if err != nil {
 			t.Fatalf("failed to parse file: %v", err)
 			t.Fatalf("failed to parse file: %v", err)
 		}
 		}
 		fn := func(resp api.ProgressResponse) {
 		fn := func(resp api.ProgressResponse) {
 			t.Logf("Status: %s", resp.Status)
 			t.Logf("Status: %s", resp.Status)
 		}
 		}
-		err = CreateModel(context.TODO(), model.ParseName(name), "", "", modelfile, fn)
+		err = CreateModel(context.TODO(), model.ParseName(name), "", "", fileutils, fn)
 		if err != nil {
 		if err != nil {
 			t.Fatalf("failed to create model: %v", err)
 			t.Fatalf("failed to create model: %v", err)
 		}
 		}
@@ -561,8 +561,8 @@ func TestShow(t *testing.T) {
 		Name: "show-model",
 		Name: "show-model",
 		Modelfile: fmt.Sprintf(
 		Modelfile: fmt.Sprintf(
 			"FROM %s\nFROM %s",
 			"FROM %s\nFROM %s",
-			createBinFile(t, llm.KV{"general.architecture": "test"}, nil),
-			createBinFile(t, llm.KV{"general.type": "projector", "general.architecture": "clip"}, nil),
+			createBinFile(t, fileutils.KV{"general.architecture": "test"}, nil),
+			createBinFile(t, fileutils.KV{"general.type": "projector", "general.architecture": "clip"}, nil),
 		),
 		),
 	})
 	})
 
 

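One note on the Test_Routes hunk above: renaming the parsed Modelfile variable from modelfile to fileutils shadows the newly imported fileutils package for the remainder of that function, so a neutral name avoids the collision. Below is a minimal sketch of the parse-then-create flow, assuming the parser.ParseFile and CreateModel signatures shown in this diff; createFromModelfile is a hypothetical helper name.

```go
package server

import (
	"context"
	"strings"
	"testing"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/parser"
	"github.com/ollama/ollama/types/model"
)

// createFromModelfile parses a Modelfile and registers it under name,
// condensing the Test_Routes flow above. Sketch only: it assumes the
// signatures used in this diff.
func createFromModelfile(t *testing.T, name, modelfile string) {
	t.Helper()

	parsed, err := parser.ParseFile(strings.NewReader(modelfile))
	if err != nil {
		t.Fatalf("failed to parse file: %v", err)
	}

	// Progress callback mirrors the one in Test_Routes: log each status update.
	progress := func(resp api.ProgressResponse) {
		t.Logf("Status: %s", resp.Status)
	}

	if err := CreateModel(context.TODO(), model.ParseName(name), "", "", parsed, progress); err != nil {
		t.Fatalf("failed to create model: %v", err)
	}
}
```
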
+ 16 - 15
server/sched.go

@@ -17,8 +17,9 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/runners"
 )
 )
 
 
 type LlmRequest struct {
 type LlmRequest struct {
@@ -41,8 +42,8 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex
 	loadedMu sync.Mutex
 
 
-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
-	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	loadFn       func(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error)
 	getGpuFn     func() discover.GpuInfoList
 	getGpuFn     func() discover.GpuInfoList
 	getCpuFn     func() discover.GpuInfoList
 	getCpuFn     func() discover.GpuInfoList
 	reschedDelay time.Duration
 	reschedDelay time.Duration
@@ -68,7 +69,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		expiredCh:     make(chan *runnerRef, maxQueue),
 		expiredCh:     make(chan *runnerRef, maxQueue),
 		unloadedCh:    make(chan interface{}, maxQueue),
 		unloadedCh:    make(chan interface{}, maxQueue),
 		loaded:        make(map[string]*runnerRef),
 		loaded:        make(map[string]*runnerRef),
-		newServerFn:   llm.NewLlamaServer,
+		newServerFn:   runners.NewLlamaServer,
 		getGpuFn:      discover.GetGPUInfo,
 		getGpuFn:      discover.GetGPUInfo,
 		getCpuFn:      discover.GetCPUInfo,
 		getCpuFn:      discover.GetCPUInfo,
 		reschedDelay:  250 * time.Millisecond,
 		reschedDelay:  250 * time.Millisecond,
@@ -187,7 +188,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					}
 					}
 
 
 					// Load model for fitting
 					// Load model for fitting
-					ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
+					ggml, err := fileutils.LoadModel(pending.model.ModelPath, 0)
 					if err != nil {
 					if err != nil {
 						pending.errCh <- err
 						pending.errCh <- err
 						break
 						break
@@ -409,7 +410,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 	}()
 }
 }
 
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 	if numParallel < 1 {
 		numParallel = 1
 		numParallel = 1
 	}
 	}
@@ -422,7 +423,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoL
 		// some older models are not compatible with newer versions of llama.cpp
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
 		// show a generalized compatibility error until there is a better way to
 		// check for model compatibility
 		// check for model compatibility
-		if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
+		if errors.Is(err, fileutils.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
 			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
 			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
 		}
 		}
 		slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
 		slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
@@ -540,7 +541,7 @@ type runnerRef struct {
 	refCount uint // prevent unloading if > 0
 	refCount uint // prevent unloading if > 0
 	// unloading bool      // set to true when we are trying to unload the runner
 	// unloading bool      // set to true when we are trying to unload the runner
 
 
-	llama          llm.LlamaServer
+	llama          runners.LLMServer
 	loading        bool                 // True only during initial load, then false forever
 	loading        bool                 // True only during initial load, then false forever
 	gpus           discover.GpuInfoList // Recorded at time of provisioning
 	gpus           discover.GpuInfoList // Recorded at time of provisioning
 	estimatedVRAM  uint64
 	estimatedVRAM  uint64
@@ -685,7 +686,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model cannot fit fully within the available GPU(s), nil is returned
 // If the model cannot fit fully within the available GPU(s), nil is returned
 // If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
 // If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64
 	var estimatedVRAM uint64
 
 
 	var numParallelToTry []int
 	var numParallelToTry []int
@@ -710,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 			req.opts.NumCtx = req.origNumCtx * p
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = fileutils.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
 						*numParallel = p
 						return []discover.GpuInfo{g}
 						return []discover.GpuInfo{g}
@@ -726,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 		// Now try all the GPUs
 		// Now try all the GPUs
 		for _, p := range numParallelToTry {
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
 			req.opts.NumCtx = req.origNumCtx * p
-			if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+			if ok, estimatedVRAM = fileutils.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
 				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
 				*numParallel = p
 				*numParallel = p
 				return sgl
 				return sgl
@@ -737,7 +738,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 }
 }
 
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
 		req.opts.NumCtx = req.origNumCtx
@@ -749,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.
 	var bestEstimate uint64
 	var bestEstimate uint64
 	var bestFit int
 	var bestFit int
 	for i, gl := range byLibrary {
 	for i, gl := range byLibrary {
-		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+		_, estimatedVRAM := fileutils.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
 		if estimatedVRAM > bestEstimate {
 		if estimatedVRAM > bestEstimate {
 			bestEstimate = estimatedVRAM
 			bestEstimate = estimatedVRAM
 			bestFit = i
 			bestFit = i
@@ -822,9 +823,9 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
-	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+	estimate := fileutils.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {
 	if estimate.TotalSize <= gpus[0].FreeMemory {
 		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
 		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
 		return nil
 		return nil

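In sched.go the GGUF inspection and memory estimation now come from fileutils, while the spawned server process comes from runners. Below is a minimal sketch of the single-GPU fit check that pickBestFullFitByLibrary performs, assuming the fileutils.PredictServerFit signature shown in this diff; firstGPUThatFits is a hypothetical helper name, not part of the change.

```go
package server

import (
	"log/slog"

	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/fileutils"
	"github.com/ollama/ollama/format"
)

// firstGPUThatFits returns the first GPU able to hold the model entirely in
// VRAM, or nil if none can. It condenses the single-GPU branch of
// pickBestFullFitByLibrary above; sketch only, under the signatures shown
// in this diff.
func firstGPUThatFits(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList) discover.GpuInfoList {
	for _, g := range gpus {
		// Ask the estimator whether the model, adapters, and projectors fit on this GPU alone.
		ok, estimatedVRAM := fileutils.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
		if ok {
			slog.Info("model fits on a single GPU", "gpu", g.ID, "required", format.HumanBytes2(estimatedVRAM))
			return discover.GpuInfoList{g}
		}
	}
	return nil
}
```
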
+ 14 - 13
server/sched_test.go

@@ -14,8 +14,9 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
 	"github.com/ollama/ollama/app/lifecycle"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/discover"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/runners"
 )
 )
 
 
 func TestMain(m *testing.M) {
 func TestMain(m *testing.M) {
@@ -37,7 +38,7 @@ func TestLoad(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
 	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
 	defer done()
 	defer done()
 	s := InitScheduler(ctx)
 	s := InitScheduler(ctx)
-	var ggml *llm.GGML // value not used in tests
+	var ggml *fileutils.GGML // value not used in tests
 	req := &LlmRequest{
 	req := &LlmRequest{
 		ctx:             ctx,
 		ctx:             ctx,
 		model:           &Model{ModelPath: "foo"},
 		model:           &Model{ModelPath: "foo"},
@@ -47,7 +48,7 @@ func TestLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	}
 	// Fail to load model first
 	// Fail to load model first
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 		return nil, errors.New("something failed to load model blah")
 		return nil, errors.New("something failed to load model blah")
 	}
 	}
 	gpus := discover.GpuInfoList{}
 	gpus := discover.GpuInfoList{}
@@ -61,7 +62,7 @@ func TestLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
 
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 		return server, nil
 		return server, nil
 	}
 	}
 	s.load(req, ggml, gpus, 0)
 	s.load(req, ggml, gpus, 0)
@@ -99,10 +100,10 @@ type reqBundle struct {
 	ctxDone func()
 	ctxDone func()
 	srv     *mockLlm
 	srv     *mockLlm
 	req     *LlmRequest
 	req     *LlmRequest
-	ggml    *llm.GGML
+	ggml    *fileutils.GGML
 }
 }
 
 
-func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 	return scenario.srv, nil
 	return scenario.srv, nil
 }
 }
 
 
@@ -115,7 +116,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	require.NoError(t, err)
 	require.NoError(t, err)
 	defer f.Close()
 	defer f.Close()
 
 
-	require.NoError(t, llm.WriteGGUF(f, llm.KV{
+	require.NoError(t, fileutils.WriteGGUF(f, fileutils.KV{
 		"general.architecture":          "llama",
 		"general.architecture":          "llama",
 		"llama.context_length":          uint32(32),
 		"llama.context_length":          uint32(32),
 		"llama.embedding_length":        uint32(4096),
 		"llama.embedding_length":        uint32(4096),
@@ -125,7 +126,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		"tokenizer.ggml.tokens":         []string{" "},
 		"tokenizer.ggml.tokens":         []string{" "},
 		"tokenizer.ggml.scores":         []float32{0},
 		"tokenizer.ggml.scores":         []float32{0},
 		"tokenizer.ggml.token_type":     []int32{0},
 		"tokenizer.ggml.token_type":     []int32{0},
-	}, []llm.Tensor{
+	}, []fileutils.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}))
 	}))
@@ -133,7 +134,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 
 
 	fname := f.Name()
 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
 	model := &Model{Name: modelName, ModelPath: fname}
-	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
+	b.ggml, err = fileutils.LoadModel(model.ModelPath, 0)
 	require.NoError(t, err)
 	require.NoError(t, err)
 
 
 	if duration == nil {
 	if duration == nil {
@@ -419,10 +420,10 @@ func TestExpireRunner(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
 		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
 	}
 	}
 
 
-	var ggml *llm.GGML
+	var ggml *fileutils.GGML
 	gpus := discover.GpuInfoList{}
 	gpus := discover.GpuInfoList{}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 		return server, nil
 		return server, nil
 	}
 	}
 	s.load(req, ggml, gpus, 0)
 	s.load(req, ggml, gpus, 0)
@@ -729,7 +730,7 @@ func TestHomogeneousGPUs(t *testing.T) {
 	}
 	}
 	s.getCpuFn = getCpuFn
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
 		require.Len(t, gpus, 1)
 		require.Len(t, gpus, 1)
 		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
 		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
 	}
 	}
@@ -768,7 +769,7 @@ type mockLlm struct {
 
 
 func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
 func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
 func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
 func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
-func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
+func (s *mockLlm) Completion(ctx context.Context, req runners.CompletionRequest, fn func(runners.CompletionResponse)) error {
 	return s.completionResp
 	return s.completionResp
 }
 }
 
 

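The scheduler tests swap newServerFn for a closure that returns a mockLlm, which after this change has to satisfy runners.LLMServer rather than llm.LlamaServer. Below is a minimal sketch of that wiring, assuming the post-move newServerFn signature shown in this diff; stubNewServer is a hypothetical helper name.

```go
package server

import (
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/fileutils"
	"github.com/ollama/ollama/runners"
)

// stubNewServer makes the scheduler hand back a canned runners.LLMServer
// instead of launching a real runner process. It mirrors the newServerFn
// overrides in the tests above; sketch only.
func stubNewServer(s *Scheduler, srv runners.LLMServer) {
	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error) {
		return srv, nil
	}
}
```

Something like stubNewServer(s, &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}) would reproduce the setup used in TestLoad and TestExpireRunner above.
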
+ 2 - 2
template/template_test.go

@@ -14,7 +14,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/fileutils"
 )
 )
 
 
 func TestNamed(t *testing.T) {
 func TestNamed(t *testing.T) {
@@ -33,7 +33,7 @@ func TestNamed(t *testing.T) {
 
 
 		for k, v := range ss {
 		for k, v := range ss {
 			t.Run(k, func(t *testing.T) {
 			t.Run(k, func(t *testing.T) {
-				kv := llm.KV{"tokenizer.chat_template": v}
+				kv := fileutils.KV{"tokenizer.chat_template": v}
 				s := kv.ChatTemplate()
 				s := kv.ChatTemplate()
 				r, err := Named(s)
 				r, err := Named(s)
 				if err != nil {
 				if err != nil {