Michael Yang · 2 months ago
commit 3241b45790
4 changed files with 187 additions and 27 deletions
  1. fs/ggml/ggml.go (+21 −21)
  2. fs/ggml/ggml_test.go (+155 −0)
  3. fs/ggml/type.go (+8 −3)
  4. llm/memory.go (+3 −3)

+ 21 - 21
fs/ggml/ggml.go

@@ -153,7 +153,7 @@ func (s Tensors) Items(prefix ...string) []*Tensor {
 	return items
 }
 
-func (ts Tensors) Layers() map[string]Layer {
+func (ts Tensors) GroupLayers() map[string]Layer {
 	layers := make(map[string]Layer)
 	for _, t := range ts.items {
 		parts := strings.Split(t.Name, ".")
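The rename clarifies what the method actually does: tensor names are grouped by their leading component, with "blk.N" kept together as a single group key. A minimal in-package sketch of the expected mapping, mirroring the new test below (the example tensors are assumptions, not part of this commit):

	ts := Tensors{items: []*Tensor{
		{Name: "blk.0.attn_q.weight"},   // text block 0
		{Name: "v.blk.0.attn_q.weight"}, // vision tower
		{Name: "token_embd.weight"},
	}}
	layers := ts.GroupLayers()
	_ = layers["blk.0"]["attn_q.weight"]   // *Tensor
	_ = layers["v"]["blk.0.attn_q.weight"] // grouped under "v", not "v.blk.0"
	_ = layers["token_embd"]["weight"]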
@@ -377,22 +377,22 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
-	embedding := llm.KV().EmbeddingLength()
-	heads := llm.KV().HeadCount()
-	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
+func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialOffload, fullOffload uint64) {
+	embedding := f.KV().EmbeddingLength()
+	heads := f.KV().HeadCount()
+	headsKV := f.KV().HeadCountKV()
+	vocab := uint64(f.KV()["tokenizer.ggml.tokens"].(*array).size)
 
-	embeddingHeads := llm.KV().EmbeddingHeadCount()
-	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
-	embeddingHeadsV := llm.KV().EmbeddingHeadCountV()
+	embeddingHeads := f.KV().EmbeddingHeadCount()
+	embeddingHeadsK := f.KV().EmbeddingHeadCountK()
+	embeddingHeadsV := f.KV().EmbeddingHeadCountV()
 
-	layers := llm.Tensors().Layers()
+	layers := f.Tensors().GroupLayers()
 
 	bytesPerElement := kvCacheBytesPerElement(kvCacheType)
-	kv = uint64(float64(context*llm.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
+	kv = uint64(float64(context*f.KV().BlockCount()*(embeddingHeadsK+embeddingHeadsV)*headsKV) * bytesPerElement)
 
-	switch llm.KV().Architecture() {
+	switch f.KV().Architecture() {
 	case "llama":
 		fullOffload = max(
 			4*batch*(1+4*embedding+context*(1+heads)),
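For a sense of scale, the kv term computed above is context × BlockCount × (embeddingHeadsK + embeddingHeadsV) × headsKV × bytesPerElement. With assumed Llama-3-8B-like dimensions (32 blocks, 8 KV heads, 128-dim K and V heads) and an f16 cache at 2 bytes per element:

	kv = 2048 × 32 × (128 + 128) × 8 × 2 bytes = 268,435,456 bytes ≈ 256 MiB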
@@ -407,7 +407,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 
 		if ffnGateExpsWeight, ok := layers["blk.0"]["ffn_gate_exps.weight"]; ok {
 			// mixtral 8x22b
-			ff := uint64(llm.KV()["llama.feed_forward_length"].(uint32))
+			ff := uint64(f.KV()["llama.feed_forward_length"].(uint32))
 			partialOffload = max(
 				3*ffnGateExpsWeight.Size()+4*batch*(2*ff+headsKV+embedding+context+embeddingHeads*headsKV),
 				4*(context*batch*heads+context*embeddingHeads*headsKV+batch*1024+embeddingHeads*headsKV*batch),
@@ -424,11 +424,11 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 	case "mllama":
 		var visionTokens, tiles uint64 = 1601, 4
 
-		if crossAttentionLayers, ok := llm.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
+		if crossAttentionLayers, ok := f.KV()["mllama.attention.cross_attention_layers"].(*array); ok {
 			kv = headsKV *
 				(embeddingHeadsK + embeddingHeadsV) * // one for K, one for V
 				(2* // sizeof(float16)
-					(llm.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
+					(f.KV().BlockCount()-uint64(crossAttentionLayers.size))* // num non-cross attention layers
 					context +
 					4* // sizeof(float32)
 						uint64(crossAttentionLayers.size)* // num cross attention layers
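The split is the point of this branch: the non-cross-attention layers cache K/V in f16 (2 bytes per element) over the text context, while the cross-attention layers cache in f32 (4 bytes) over what, from the truncated continuation, appears to be the vision sequence of visionTokens × tiles = 1601 × 4 = 6404 positions, independent of the text context.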
@@ -443,7 +443,7 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 		)
 
 		var ropeFreqsCount uint64
-		if ropeFreqs, ok := llm.Tensors().Layers()["rope_freqs"]; ok {
+		if ropeFreqs, ok := f.Tensors().GroupLayers()["rope_freqs"]; ok {
 			if ropeFreqsWeights, ok := ropeFreqs["weights"]; ok {
 				ropeFreqsCount = ropeFreqsWeights.parameters()
 			}
@@ -547,20 +547,20 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 }
 
 // SupportsKVCacheType checks if the requested cache type is supported
-func (llm GGML) SupportsKVCacheType(cacheType string) bool {
+func (f GGML) SupportsKVCacheType(cacheType string) bool {
 	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
 }
 
 // SupportsFlashAttention checks if the model supports flash attention
-func (llm GGML) SupportsFlashAttention() bool {
-	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
+func (f GGML) SupportsFlashAttention() bool {
+	_, isEmbedding := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]
 	if isEmbedding {
 		return false
 	}
 
 	// Check head counts match and are non-zero
-	headCountK := llm.KV().EmbeddingHeadCountK()
-	headCountV := llm.KV().EmbeddingHeadCountV()
+	headCountK := f.KV().EmbeddingHeadCountK()
+	headCountV := f.KV().EmbeddingHeadCountV()
 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
 }
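A caller can combine the two predicates to decide whether a quantized cache is safe to enable. A hedged sketch (the fallback policy here is an assumption, not something this commit prescribes):

	// f is a ggml.GGML decoded from a model file (see Decode above).
	cacheType := "q8_0"
	if !f.SupportsKVCacheType(cacheType) || !f.SupportsFlashAttention() {
		cacheType = "f16" // fall back to the unquantized cache
	}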
 

+ 155 - 0
fs/ggml/ggml_test.go

@@ -0,0 +1,155 @@
+package ggml
+
+import (
+	"maps"
+	"slices"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestTensorLayers(t *testing.T) {
+	tensors := make(map[string]*Tensor)
+	for _, name := range []string{
+		"token_embd.weight",
+		"blk.0.attn_k.weight",
+		"blk.0.attn_output.weight",
+		"blk.0.attn_q.weight",
+		"blk.0.attn_v.weight",
+		"blk.0.attn_norm.weight",
+		"blk.0.ffn_down.weight",
+		"blk.0.ffn_gate.weight",
+		"blk.0.ffn_up.weight",
+		"blk.0.ffn_norm.weight",
+		"output_norm.weight",
+		"mm.0.bias",
+		"mm.0.weight",
+		"v.blk.0.attn_k.weight",
+		"v.blk.0.attn_output.weight",
+		"v.blk.0.attn_q.weight",
+		"v.blk.0.attn_v.weight",
+		"v.blk.0.attn_norm.weight",
+		"v.blk.0.ffn_down.weight",
+		"v.blk.0.ffn_gate.weight",
+		"v.blk.0.ffn_up.weight",
+		"v.blk.0.ffn_norm.weight",
+		"v.patch_embd.weight",
+		"v.position_embd.gate",
+		"v.position_embd.weight",
+	} {
+		tensors[name] = &Tensor{Name: name}
+	}
+
+	cases := []struct {
+		name  string
+		items []*Tensor
+		want  map[string]Layer
+	}{
+		{
+			name: "text",
+			items: slices.Collect(func(yield func(*Tensor) bool) {
+				for k, v := range tensors {
+					if !strings.HasPrefix(k, "mm.") && !strings.HasPrefix(k, "v.") {
+						if !yield(v) {
+							return
+						}
+					}
+				}
+			}),
+			want: map[string]Layer{
+				"blk.0": {
+					"attn_k.weight":      tensors["blk.0.attn_k.weight"],
+					"attn_q.weight":      tensors["blk.0.attn_q.weight"],
+					"attn_v.weight":      tensors["blk.0.attn_v.weight"],
+					"attn_output.weight": tensors["blk.0.attn_output.weight"],
+					"attn_norm.weight":   tensors["blk.0.attn_norm.weight"],
+					"ffn_down.weight":    tensors["blk.0.ffn_down.weight"],
+					"ffn_gate.weight":    tensors["blk.0.ffn_gate.weight"],
+					"ffn_up.weight":      tensors["blk.0.ffn_up.weight"],
+					"ffn_norm.weight":    tensors["blk.0.ffn_norm.weight"],
+				},
+				"token_embd":  {"weight": tensors["token_embd.weight"]},
+				"output_norm": {"weight": tensors["output_norm.weight"]},
+			},
+		},
+		{
+			name: "vision",
+			items: slices.Collect(func(yield func(*Tensor) bool) {
+				for k, v := range tensors {
+					if strings.HasPrefix(k, "mm.") || strings.HasPrefix(k, "v.") {
+						if !yield(v) {
+							return
+						}
+					}
+				}
+			}),
+			want: map[string]Layer{
+				"mm": {
+					"0.bias":   tensors["mm.0.bias"],
+					"0.weight": tensors["mm.0.weight"],
+				},
+				"v": {
+					"blk.0.attn_k.weight":      tensors["v.blk.0.attn_k.weight"],
+					"blk.0.attn_q.weight":      tensors["v.blk.0.attn_q.weight"],
+					"blk.0.attn_v.weight":      tensors["v.blk.0.attn_v.weight"],
+					"blk.0.attn_output.weight": tensors["v.blk.0.attn_output.weight"],
+					"blk.0.attn_norm.weight":   tensors["v.blk.0.attn_norm.weight"],
+					"blk.0.ffn_down.weight":    tensors["v.blk.0.ffn_down.weight"],
+					"blk.0.ffn_gate.weight":    tensors["v.blk.0.ffn_gate.weight"],
+					"blk.0.ffn_up.weight":      tensors["v.blk.0.ffn_up.weight"],
+					"blk.0.ffn_norm.weight":    tensors["v.blk.0.ffn_norm.weight"],
+					"patch_embd.weight":        tensors["v.patch_embd.weight"],
+					"position_embd.gate":       tensors["v.position_embd.gate"],
+					"position_embd.weight":     tensors["v.position_embd.weight"],
+				},
+			},
+		},
+		{
+			name:  "vision and text",
+			items: slices.Collect(maps.Values(tensors)),
+			want: map[string]Layer{
+				"blk.0": {
+					"attn_k.weight":      tensors["blk.0.attn_k.weight"],
+					"attn_q.weight":      tensors["blk.0.attn_q.weight"],
+					"attn_v.weight":      tensors["blk.0.attn_v.weight"],
+					"attn_output.weight": tensors["blk.0.attn_output.weight"],
+					"attn_norm.weight":   tensors["blk.0.attn_norm.weight"],
+					"ffn_down.weight":    tensors["blk.0.ffn_down.weight"],
+					"ffn_gate.weight":    tensors["blk.0.ffn_gate.weight"],
+					"ffn_up.weight":      tensors["blk.0.ffn_up.weight"],
+					"ffn_norm.weight":    tensors["blk.0.ffn_norm.weight"],
+				},
+				"token_embd":  {"weight": tensors["token_embd.weight"]},
+				"output_norm": {"weight": tensors["output_norm.weight"]},
+				"mm": {
+					"0.bias":   tensors["mm.0.bias"],
+					"0.weight": tensors["mm.0.weight"],
+				},
+				"v": {
+					"blk.0.attn_k.weight":      tensors["v.blk.0.attn_k.weight"],
+					"blk.0.attn_q.weight":      tensors["v.blk.0.attn_q.weight"],
+					"blk.0.attn_v.weight":      tensors["v.blk.0.attn_v.weight"],
+					"blk.0.attn_output.weight": tensors["v.blk.0.attn_output.weight"],
+					"blk.0.attn_norm.weight":   tensors["v.blk.0.attn_norm.weight"],
+					"blk.0.ffn_down.weight":    tensors["v.blk.0.ffn_down.weight"],
+					"blk.0.ffn_gate.weight":    tensors["v.blk.0.ffn_gate.weight"],
+					"blk.0.ffn_up.weight":      tensors["v.blk.0.ffn_up.weight"],
+					"blk.0.ffn_norm.weight":    tensors["v.blk.0.ffn_norm.weight"],
+					"patch_embd.weight":        tensors["v.patch_embd.weight"],
+					"position_embd.gate":       tensors["v.position_embd.gate"],
+					"position_embd.weight":     tensors["v.position_embd.weight"],
+				},
+			},
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			got := Tensors{items: tt.items}.GroupLayers()
+			if diff := cmp.Diff(got, tt.want); diff != "" {
+				t.Errorf("unexpected layers (-got +want):\n%s", diff)
+			}
+		})
+	}
+}
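Because the test constructs Tensors through the unexported items field, it has to live in package ggml; it also adds a dependency on github.com/google/go-cmp. Assuming a standard checkout, it should run with:

	go test ./fs/ggml -run TestTensorLayers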

+ 8 - 3
fs/ggml/type.go

@@ -32,9 +32,10 @@ const (
 	fileTypeIQ1_S
 	fileTypeIQ4_NL
 	fileTypeIQ3_S
+	fileTypeIQ3_M
 	fileTypeIQ2_S
-	fileTypeIQ4_XS
 	fileTypeIQ2_M
+	fileTypeIQ4_XS
 	fileTypeIQ1_M
 	fileTypeBF16
 
@@ -93,12 +94,14 @@ func ParseFileType(s string) (fileType, error) {
 		return fileTypeIQ4_NL, nil
 	case "IQ3_S":
 		return fileTypeIQ3_S, nil
+	case "IQ3_M":
+		return fileTypeIQ3_M, nil
 	case "IQ2_S":
 		return fileTypeIQ2_S, nil
-	case "IQ4_XS":
-		return fileTypeIQ4_XS, nil
 	case "IQ2_M":
 		return fileTypeIQ2_M, nil
+	case "IQ4_XS":
+		return fileTypeIQ4_XS, nil
 	case "IQ1_M":
 		return fileTypeIQ1_M, nil
 	case "BF16":
@@ -160,6 +163,8 @@ func (t fileType) String() string {
 		return "IQ4_NL"
 	case fileTypeIQ3_S:
 		return "IQ3_S"
+	case fileTypeIQ3_M:
+		return "IQ3_M"
 	case fileTypeIQ2_S:
 		return "IQ2_S"
 	case fileTypeIQ4_XS:

+ 3 - 3
llm/memory.go

@@ -116,7 +116,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
 
-	layers := f.Tensors().Layers()
+	layers := f.Tensors().GroupLayers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
 		layerSize = blk0.Size()
@@ -410,7 +410,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 		return 0, 0
 	}
 
-	for _, layer := range ggml.Tensors().Layers() {
+	for _, layer := range ggml.Tensors().GroupLayers() {
 		weights += layer.Size()
 	}
 
@@ -431,7 +431,7 @@ func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
 		headCount := kv("attention.head_count")
 
 		numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
-		if _, ok := ggml.Tensors().Layers()["v"]["class_embd"]; ok {
+		if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
 			numPatches++
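As a concrete check of the patch math, with dimensions assumed from a typical CLIP ViT-L/14 projector at 336 px: imageSize = 336 and patch_size = 14 give (336 / 14)² = 24² = 576 patches, plus one position when v.class_embd is present, for 577.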
 		}