1 ヶ月前 · 4ea4d2b189
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -583,39 +583,52 @@ func (f GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partialO
 
				 }
			
 
				 
			
 
				 func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
			
 
				-	switch llm.KV().Architecture() {
			
 
				-	case "mllama":
			
 
				-		for _, layer := range llm.Tensors().GroupLayers()["v"] {
			
 
				-			weights += layer.Size()
			
 
				-		}
			
 
				+	if llm.KV().Uint("vision.block_count") == 0 {
			
 
				+		return
			
 
				+	}
			
 
				 
			
 
				-		kv := func(n string) uint64 {
			
 
				-			if v, ok := llm.KV()["mllama.vision."+n].(uint32); ok {
			
 
				-				return uint64(v)
			
 
				+	for name, layer := range llm.Tensors().GroupLayers() {
			
 
				+		if name == "v" || strings.HasPrefix(name, "v.") {
			
 
				+			for _, tensor := range layer {
			
 
				+				weights += tensor.Size()
			
 
				 			}
			
 
				-
			
 
				-			return 0
			
 
				 		}
			
 
				+	}
			
 
				+
			
 
				+	imageSize := uint64(llm.KV().Uint("vision.image_size"))
			
 
				+	patchSize := uint64(llm.KV().Uint("vision.patch_size"))
			
 
				+	if patchSize == 0 {
			
 
				+		slog.Warn("unknown patch size for vision model")
			
 
				+		return
			
 
				+	}
			
 
				 
			
 
				-		imageSize := kv("image_size")
			
 
				+	numChannels := uint64(llm.KV().Uint("vision.num_channels"))
			
 
				 
			
 
				-		maxNumTiles := kv("max_num_tiles")
			
 
				-		embeddingLength := kv("embedding_length")
			
 
				-		headCount := kv("attention.head_count")
			
 
				+	numPatches := (imageSize / patchSize) * (imageSize / patchSize)
			
 
				+	if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
			
 
				+		numPatches++
			
 
				+	}
			
 
				 
			
 
				-		numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
			
 
				-		if _, ok := llm.Tensors().GroupLayers()["v"]["class_embd"]; ok {
			
 
				-			numPatches++
			
 
				-		}
			
 
				+	headCount := uint64(llm.KV().Uint("vision.attention.head_count"))
			
 
				+	embeddingLength := uint64(llm.KV().Uint("vision.embedding_length"))
			
 
				 
			
 
				+	switch llm.KV().Architecture() {
			
 
				+	case "mllama":
			
 
				 		numPaddedPatches := numPatches + 8 - (numPatches%8)%8
			
 
				 
			
 
				+		maxNumTiles := uint64(llm.KV().Uint("vision.max_num_tiles"))
			
 
				+
			
 
				 		graphSize = 4 * (8 +
			
 
				-			imageSize*imageSize*kv("num_channels")*maxNumTiles +
			
 
				+			imageSize*imageSize*numChannels*maxNumTiles +
			
 
				 			embeddingLength*numPatches*maxNumTiles +
			
 
				 			9*embeddingLength*numPaddedPatches*maxNumTiles +
			
 
				 			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
			
 
				+	case "gemma3":
			
 
				+		graphSize = 4 * (imageSize*imageSize*numChannels +
			
 
				+			embeddingLength*patchSize +
			
 
				+			numPatches*numPatches*headCount)
			
 
				 	}
			
 
				+
			
 
				 	return weights, graphSize
			
 
				 }
			
 
				 
			
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -218,8 +218,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 
				 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
			
 
				 			layerSize = blk.Size()
			
 
				 			layerSize += kv / f.KV().BlockCount()
			
 
				+			memoryWeights += blk.Size()
			
 
				 		}
			
 
				-		memoryWeights += layerSize
			
 
				 
			
 
				 		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
			
 
				 			// Stop allocating on GPU(s) once we hit the users target NumGPU
			
@@ -376,7 +376,7 @@ func (m MemoryEstimate) LogValue() slog.Value {
 
				 				// memory of the weights
			
 
				 				"total", format.HumanBytes2(m.memoryWeights),
			
 
				 				// memory of repeating layers
			
 
				-				"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
			
 
				+				"repeating", format.HumanBytes2(m.memoryWeights),
			
 
				 				// memory of non-repeating layers
			
 
				 				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
			
 
				 			),