Michael Yang · 2 months ago
commit 3241b45790
4 changed files with 187 additions and 27 deletions
  1. fs/ggml/ggml.go (+21 −21)
  2. fs/ggml/ggml_test.go (+155 −0)
  3. fs/ggml/type.go (+8 −3)
  4. llm/memory.go (+3 −3)

+ 21 - 21
fs/ggml/ggml.go

@@ -153,7 +153,7 @@ func (s Tensors) Items(prefix ...string) []*Tensor {
 	return items
 }
 
-func (ts Tensors) Layers() map[string]Layer {
+func (ts Tensors) GroupLayers() map[string]Layer {
 	layers := make(map[string]Layer)
 	for _, t := range ts.items {
 		parts := strings.Split(t.Name, ".")
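The rename clarifies what the method actually does: tensor names are grouped by their leading component, with "blk.N" kept together as a single group key. A minimal in-package sketch of the expected mapping, mirroring the new test below (the example tensors are assumptions, not part of this commit):

	ts := Tensors{items: []*Tensor{
		{Name: "blk.0.attn_q.weight"},   // text block 0
		{Name: "v.blk.0.attn_q.weight"}, // vision tower
		{Name: "token_embd.weight"},
	}}
	layers := ts.GroupLayers()
	_ = layers["blk.0"]["attn_q.weight"]   // *Tensor
	_ = layers["v"]["blk.0.attn_q.weight"] // grouped under "v", not "v.blk.0"
	_ = layers["token_embd"]["weight"]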
@@ -377,22 +377,22 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
-	embedding := llm.KV().EmbeddingLength()
-	heads := llm.KV().HeadCount()
-	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
+func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
+	embedding := f.KV().EmbeddingLength()
+	heads := f.KV().HeadCount()
+	headsKV := f.KV().HeadCountKV()
+	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)
 
-	embeddingHeads := llm.KV().EmbeddingHeadCount()
-	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
-	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
+	embeddingHeads := f.KV().EmbeddingHeadCount()
+	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
 
-	layers := llm.Tensors().Layers()
+	layers := f.Tensors().GroupLayers()
 
 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
-	kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+	kv = uint64(float64(context*f.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
 
-	switch llm.KV().Architecture() {
+	switch f.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
 			4*batch*(1+4*embedding+context*(1+heads)),
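For a sense of scale, the kv term computed above is context × BlockCount × (embeddingHeadsK + embeddingHeadsV) × headsKV × bytesPerElement. With assumed Llama-3-8B-like dimensions (32 blocks, 8 KV heads, 128-dim K and V heads) and an f16 cache at 2 bytes per element:

	kv = 2048 × 32 × (128 + 128) × 8 × 2 bytes = 268,435,456 bytes ≈ 256 MiB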
@@ -407,7 +407,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 
 		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
 			// mixtral 8x22b
-			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
+			ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
 			partialOffload = max(
 				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
 				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
@@ -424,11 +424,11 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 	case "mllama":
 		var visionTokens, tiles uint64 = 1601, 4
 
-		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+		if crossAttentionLayers, ok := f.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
 			kv = headsKV *
 				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
 				(2* // sizeof(float16)
-					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
+					(f.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
 					context +
 					4* // sizeof(float32)
 						uint64(crossAttentionLayers.size)* // num cross attention layers
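The split is the point of this branch: the non-cross-attention layers cache K/V in f16 (2 bytes per element) over the text context, while the cross-attention layers cache in f32 (4 bytes) over what, from the truncated continuation, appears to be the vision sequence of visionTokens × tiles = 1601 × 4 = 6404 positions, independent of the text context.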
@@ -443,7 +443,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 		)
 
 		var ropeFreqsCount uint64
-		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
+		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
 			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
 				ropeFreqsCount = ropeFreqsWeights.parameters()
 			}
@@ -547,20 +547,20 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 }
 
 // SupportsKVCacheType checks if the requested cache type is supported
-func (llm GGML) SupportsKVCacheType(cacheType string) bool {
+func (f GGML) SupportsKVCacheType(cacheType string) bool {
 	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }
 
 // SupportsFlashAttention checks if the model supports flash attention
-func (llm GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
+func (f GGML) SupportsFlashAttention() bool {
+	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
 	if isEmbedding {
 		return false
 	}
 
 	// Check head counts match and are non-zero
-	headCountK := llm.KV().EmbeddingHeadCountK()
-	headCountV := llm.KV().EmbeddingHeadCountV()
+	headCountK := f.KV().EmbeddingHeadCountK()
+	headCountV := f.KV().EmbeddingHeadCountV()
 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }
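A caller can combine the two predicates to decide whether a quantized cache is safe to enable. A hedged sketch (the fallback policy here is an assumption, not something this commit prescribes):

	// f is a ggml.GGML decoded from a model file (see Decode above).
	cacheType := "q8_0"
	if !f.SupportsKVCacheType(cacheType) || !f.SupportsFlashAttention() {
		cacheType = "f16" // fall back to the unquantized cache
	}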
 

+ 155 - 0
fs/ggml/ggml_test.go

@@ -0,0 +1,155 @@
+package ggml
+
+import (
+	"maps"
+	"slices"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestTensorLayers(t *testing.T) {
+	tensors := make(map[string]*Tensor)
+	for _, name := range []string{
+		"token_embd.weight",
+		"blk.0.attn_k.weight",
+		"blk.0.attn_output.weight",
+		"blk.0.attn_q.weight",
+		"blk.0.attn_v.weight",
+		"blk.0.attn_norm.weight",
+		"blk.0.ffn_down.weight",
+		"blk.0.ffn_gate.weight",
+		"blk.0.ffn_up.weight",
+		"blk.0.ffn_norm.weight",
+		"output_norm.weight",
+		"mm.0.bias",
+		"mm.0.weight",
+		"v.blk.0.attn_k.weight",
+		"v.blk.0.attn_output.weight",
+		"v.blk.0.attn_q.weight",
+		"v.blk.0.attn_v.weight",
+		"v.blk.0.attn_norm.weight",
+		"v.blk.0.ffn_down.weight",
+		"v.blk.0.ffn_gate.weight",
+		"v.blk.0.ffn_up.weight",
+		"v.blk.0.ffn_norm.weight",
+		"v.patch_embd.weight",
+		"v.position_embd.gate",
+		"v.position_embd.weight",
+	} {
+		tensors[name] = &Tensor{Name: name}
+	}
+
+	cases := []struct {
+		name  string
+		items []*Tensor
+		want  map[string]Layer
+	}{
+		{
+			name: "text",
+			items: slices.Collect(func(yield func(*Tensor) bool) {
+				for k, v := range tensors {
+					if !strings.HasPrefix(k, "mm.") && !strings.HasPrefix(k, "v.") {
+						if !yield(v) {
+							return
+						}
+					}
+				}
+			}),
+			want: map[string]Layer{
+				"blk.0": {
+					"attn_k.weight":      tensors["blk.0.attn_k.weight"],
+					"attn_q.weight":      tensors["blk.0.attn_q.weight"],
+					"attn_v.weight":      tensors["blk.0.attn_v.weight"],
+					"attn_output.weight": tensors["blk.0.attn_output.weight"],
+					"attn_norm.weight":   tensors["blk.0.attn_norm.weight"],
+					"ffn_down.weight":    tensors["blk.0.ffn_down.weight"],
+					"ffn_gate.weight":    tensors["blk.0.ffn_gate.weight"],
+					"ffn_up.weight":      tensors["blk.0.ffn_up.weight"],
+					"ffn_norm.weight":    tensors["blk.0.ffn_norm.weight"],
+				},
+				"token_embd":  {"weight": tensors["token_embd.weight"]},
+				"output_norm": {"weight": tensors["output_norm.weight"]},
+			},
+		},
+		{
+			name: "vision",
+			items: slices.Collect(func(yield func(*Tensor) bool) {
+				for k, v := range tensors {
+					if strings.HasPrefix(k, "mm.") || strings.HasPrefix(k, "v.") {
+						if !yield(v) {
+							return
+						}
+					}
+				}
+			}),
+			want: map[string]Layer{
+				"mm": {
+					"0.bias":   tensors["mm.0.bias"],
+					"0.weight": tensors["mm.0.weight"],
+				},
+				"v": {
+					"blk.0.attn_k.weight":      tensors["v.blk.0.attn_k.weight"],
+					"blk.0.attn_q.weight":      tensors["v.blk.0.attn_q.weight"],
+					"blk.0.attn_v.weight":      tensors["v.blk.0.attn_v.weight"],
+					"blk.0.attn_output.weight": tensors["v.blk.0.attn_output.weight"],
+					"blk.0.attn_norm.weight":   tensors["v.blk.0.attn_norm.weight"],
+					"blk.0.ffn_down.weight":    tensors["v.blk.0.ffn_down.weight"],
+					"blk.0.ffn_gate.weight":    tensors["v.blk.0.ffn_gate.weight"],
+					"blk.0.ffn_up.weight":      tensors["v.blk.0.ffn_up.weight"],
+					"blk.0.ffn_norm.weight":    tensors["v.blk.0.ffn_norm.weight"],
+					"patch_embd.weight":        tensors["v.patch_embd.weight"],
+					"position_embd.gate":       tensors["v.position_embd.gate"],
+					"position_embd.weight":     tensors["v.position_embd.weight"],
+				},
+			},
+		},
+		{
+			name:  "vision and text",
+			items: slices.Collect(maps.Values(tensors)),
+			want: map[string]Layer{
+				"blk.0": {
+					"attn_k.weight":      tensors["blk.0.attn_k.weight"],
+					"attn_q.weight":      tensors["blk.0.attn_q.weight"],
+					"attn_v.weight":      tensors["blk.0.attn_v.weight"],
+					"attn_output.weight": tensors["blk.0.attn_output.weight"],
+					"attn_norm.weight":   tensors["blk.0.attn_norm.weight"],
+					"ffn_down.weight":    tensors["blk.0.ffn_down.weight"],
+					"ffn_gate.weight":    tensors["blk.0.ffn_gate.weight"],
+					"ffn_up.weight":      tensors["blk.0.ffn_up.weight"],
+					"ffn_norm.weight":    tensors["blk.0.ffn_norm.weight"],
+				},
+				"token_embd":  {"weight": tensors["token_embd.weight"]},
+				"output_norm": {"weight": tensors["output_norm.weight"]},
+				"mm": {
+					"0.bias":   tensors["mm.0.bias"],
+					"0.weight": tensors["mm.0.weight"],
+				},
+				"v": {
+					"blk.0.attn_k.weight":      tensors["v.blk.0.attn_k.weight"],
+					"blk.0.attn_q.weight":      tensors["v.blk.0.attn_q.weight"],
+					"blk.0.attn_v.weight":      tensors["v.blk.0.attn_v.weight"],
+					"blk.0.attn_output.weight": tensors["v.blk.0.attn_output.weight"],
+					"blk.0.attn_norm.weight":   tensors["v.blk.0.attn_norm.weight"],
+					"blk.0.ffn_down.weight":    tensors["v.blk.0.ffn_down.weight"],
+					"blk.0.ffn_gate.weight":    tensors["v.blk.0.ffn_gate.weight"],
+					"blk.0.ffn_up.weight":      tensors["v.blk.0.ffn_up.weight"],
+					"blk.0.ffn_norm.weight":    tensors["v.blk.0.ffn_norm.weight"],
+					"patch_embd.weight":        tensors["v.patch_embd.weight"],
+					"position_embd.gate":       tensors["v.position_embd.gate"],
+					"position_embd.weight":     tensors["v.position_embd.weight"],
+				},
+			},
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			got := Tensors{items: tt.items}.GroupLayers()
+			if diff := cmp.Diff(got, tt.want); diff != "" {
+				t.Errorf("unexpected layers (-got +want):\n%s", diff)
+			}
+		})
+	}
+}
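Because the test constructs Tensors through the unexported items field, it has to live in package ggml; it also adds a dependency on github.com/google/go-cmp. Assuming a standard checkout, it should run with:

	go test ./fs/ggml -run TestTensorLayers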

+ 8 - 3
fs/ggml/type.go

@@ -32,9 +32,10 @@ const (
 	fileTypeIQ1_S
 	fileTypeIQ4_NL
 	fileTypeIQ3_S
+	fileTypeIQ3_M
 	fileTypeIQ2_S
-	fileTypeIQ4_XS
 	fileTypeIQ2_M
+	fileTypeIQ4_XS
 	fileTypeIQ1_M
 	fileTypeBF16
 
@@ -93,12 +94,14 @@ func ParseFileType(s string) (fileType, error) {
 		return fileTypeIQ4_NL, nil
 	case "IQ3_S":
 		return fileTypeIQ3_S, nil
+	case "IQ3_M":
+		return fileTypeIQ3_M, nil
 	case "IQ2_S":
 		return fileTypeIQ2_S, nil
-	case "IQ4_XS":
-		return fileTypeIQ4_XS, nil
 	case "IQ2_M":
 		return fileTypeIQ2_M, nil
+	case "IQ4_XS":
+		return fileTypeIQ4_XS, nil
 	case "IQ1_M":
 		return fileTypeIQ1_M, nil
 	case "BF16":
@@ -160,6 +163,8 @@ func (t fileType) String() string {
 		return "IQ4_NL"
 	case fileTypeIQ3_S:
 		return "IQ3_S"
+	case fileTypeIQ3_M:
+		return "IQ3_M"
 	case fileTypeIQ2_S:
 		return "IQ2_S"
 	case fileTypeIQ4_XS:

+ 3 - 3
llm/memory.go

@@ -116,7 +116,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
-	layers := f.Tensors().Layers()
+	layers := f.Tensors().GroupLayers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
 		layerSize = blk0.Size()
@@ -410,7 +410,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 		return 0, 0
 	}
 
-	for _, layer := range ggml.Tensors().Layers() {
+	for _, layer := range ggml.Tensors().GroupLayers() {
 		weights += layer.Size()
 	}
 
@@ -431,7 +431,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 		headCount := kv("attention.head_count")
 
 		numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
-		if _, ok := ggml.Tensors().Layers()["v"]["class_embd"]; ok {
+		if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
 			numPatches++
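As a concrete check of the patch math, with dimensions assumed from a typical CLIP ViT-L/14 projector at 336 px: imageSize = 336 and patch_size = 14 give (336 / 14)² = 24² = 576 patches, plus one position when v.class_embd is present, for 577.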
 		}