8 kuukautta sitten · 69207b4987
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -83,7 +83,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
				 	var memoryLayerOutput uint64
			
 
				 
			
 
				 	// The sizes of a layer
			
 
				-	var layerSize uint64
			
 
				+	var baseLayerSize uint64
			
 
				 
			
 
				 	// The sum of all the layer sizes (just for logging)
			
 
				 	var memoryWeights uint64
			
@@ -110,27 +110,27 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
				 	layers := ggml.Tensors().Layers()
			
 
				 	// add one layer worth of memory as a buffer
			
 
				 	if blk0, ok := layers["blk.0"]; ok {
			
 
				-		layerSize = blk0.size()
			
 
				+		baseLayerSize = blk0.size()
			
 
				 	} else {
			
 
				 		slog.Warn("model missing blk.0 layer size")
			
 
				 	}
			
 
				 
			
 
				 	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
			
 
				-	var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
			
 
				-
			
 
				-	// KV is proportional to the number of layers
			
 
				-	layerSize += kv / ggml.KV().BlockCount()
			
 
				+	kv := 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
			
 
				+	layerKV := kv / ggml.KV().BlockCount()
			
 
				+	baseLayerSize += layerKV
			
 
				 
			
 
				 	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
			
 
				 	if graphPartialOffload == 0 {
			
 
				 		graphPartialOffload = ggml.KV().GQA() * kv / 6
			
 
				 	}
			
 
				+
			
 
				 	if graphFullOffload == 0 {
			
 
				 		graphFullOffload = graphPartialOffload
			
 
				 	}
			
 
				 
			
 
				-	// on metal there's no partial offload overhead
			
 
				 	if gpus[0].Library == "metal" {
			
 
				+		// there's no partial offload overhead on metal
			
 
				 		graphPartialOffload = graphFullOffload
			
 
				 	} else if len(gpus) > 1 {
			
 
				 		// multigpu should always use the partial graph size
			
@@ -140,6 +140,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
				 	if layer, ok := layers["output_norm"]; ok {
			
 
				 		memoryLayerOutput += layer.size()
			
 
				 	}
			
 
				+
			
 
				 	if layer, ok := layers["output"]; ok {
			
 
				 		memoryLayerOutput += layer.size()
			
 
				 	} else if layer, ok := layers["token_embd"]; ok {
			
@@ -164,12 +165,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
				 			gzo = gpuZeroOverhead
			
 
				 		}
			
 
				 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
			
 
				-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
			
 
				+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*baseLayerSize {
			
 
				 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
			
 
				 			continue
			
 
				 		}
			
 
				 		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
			
 
				-		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
			
 
				+		gpuAllocations[i] += gpus[i].MinimumMemory + baseLayerSize // We hold off on graph until we know partial vs. full
			
 
				 	}
			
 
				 
			
 
				 	var gpuZeroID int
			
@@ -180,11 +181,14 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
				 
			
 
				 	// For all the layers, find where they can fit on the GPU(s)
			
 
				 	for i := range int(ggml.KV().BlockCount()) {
			
 
				-		// Some models have inconsistent layer sizes
			
 
				+		var layerSize uint64
			
 
				 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
			
 
				 			layerSize = blk.size()
			
 
				-			layerSize += kv / ggml.KV().BlockCount()
			
 
				+		} else {
			
 
				+			slog.Error("missing layer", "blk", i)
			
 
				+			continue
			
 
				 		}
			
 
				+
			
 
				 		memoryWeights += layerSize
			
 
				 
			
 
				 		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
			
@@ -196,8 +200,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
				 		for j := len(gpusWithSpace); j > 0; j-- {
			
 
				 			g := gpusWithSpace[i%j]
			
 
				 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			
 
				-			if g.g.FreeMemory > used+layerSize {
			
 
				-				gpuAllocations[g.i] += layerSize
			
 
				+			if g.g.FreeMemory > used+layerSize+layerKV {
			
 
				+				gpuAllocations[g.i] += layerSize + layerKV
			
 
				 				layerCounts[g.i]++
			
 
				 				layerCount++
			
 
				 				break
			
@@ -206,11 +210,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				+
			
 
				 	if layerCount >= int(ggml.KV().BlockCount()) {
			
 
				 		fullyLoaded = true
			
 
				 	} else {
			
 
				 		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
			
 
				-			overflow += layerSize
			
 
				+			overflow += baseLayerSize
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -265,9 +270,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
				 		}
			
 
				 		tensorSplit = strings.Join(splits, ",")
			
 
				 	}
			
 
				-	allocationsList := []string{}
			
 
				-	for _, a := range gpuAllocations {
			
 
				-		allocationsList = append(allocationsList, format.HumanBytes2(a))
			
 
				+
			
 
				+	allocationsList := make([]string, len(gpuAllocations))
			
 
				+	for i, a := range gpuAllocations {
			
 
				+		allocationsList[i] = format.HumanBytes2(a)
			
 
				 	}
			
 
				 
			
 
				 	estimate := MemoryEstimate{
			
@@ -337,9 +343,9 @@ func (m MemoryEstimate) log() {
 
				 			slog.Group(
			
 
				 				"weights",
			
 
				 				// memory of the weights
			
 
				-				"total", format.HumanBytes2(m.memoryWeights),
			
 
				+				"total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
			
 
				 				// memory of repeating layers
			
 
				-				"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
			
 
				+				"repeating", format.HumanBytes2(m.memoryWeights),
			
 
				 				// memory of non-repeating layers
			
 
				 				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
			
 
				 			),
			
--- a/llm/memory_test.go
+++ b/llm/memory_test.go
@@ -62,6 +62,15 @@ func TestEstimateGPULayers(t *testing.T) {
 
				 		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
			
 
				 		assert.Equal(t, 0, estimate.Layers)
			
 
				 		assert.Equal(t, uint64(0), estimate.Graph)
			
 
				+
			
 
				+		// 5 layers * 4 bytes per layer
			
 
				+		if estimate.memoryWeights != 20 {
			
 
				+			t.Errorf("expected memoryWeights 20, got %d", estimate.memoryWeights)
			
 
				+		}
			
 
				+
			
 
				+		if estimate.memoryLayerOutput != 4 {
			
 
				+			t.Errorf("expected memoryLayerOutput 4, got %d", estimate.memoryLayerOutput)
			
 
				+		}
			
 
				 	})
			
 
				 
			
 
				 	// derived from the dummy ggml file above
			
@@ -124,6 +133,15 @@ func TestEstimateGPULayers(t *testing.T) {
 
				 				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
			
 
				 				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
			
 
				 			}
			
 
				+
			
 
				+			// 5 layers * 4 bytes per layer
			
 
				+			if estimate.memoryWeights != 20 {
			
 
				+				t.Errorf("expected memoryWeights 20, got %d", estimate.memoryWeights)
			
 
				+			}
			
 
				+
			
 
				+			if estimate.memoryLayerOutput != 4 {
			
 
				+				t.Errorf("expected memoryLayerOutput 4, got %d", estimate.memoryLayerOutput)
			
 
				+			}
			
 
				 		})
			
 
				 	}
			
 
				 }