
Tighten up memory prediction logging

Prior to this change, we logged the memory prediction multiple times
as the scheduler iterated to find a suitable configuration, which could be
confusing since only the last log before the server started was actually valid.
The prediction is now logged once, just before starting the server with the
final configuration. The log also reports which inference library is in use
instead of always saying "offload to gpu", even when running on the CPU.
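
The core idea is to carry everything the log line needs inside the estimate itself and emit a single structured record only for the configuration that is actually used. Below is a minimal, self-contained sketch of that pattern, not ollama's real types or scheduler; the estimate struct, pick, and the candidate values are invented for illustration.

package main

import "log/slog"

// estimate mirrors the idea behind MemoryEstimate: exported results plus
// unexported fields kept around only so a later log() call has full context.
type estimate struct {
	Layers  int    // layers that can be offloaded
	library string // which inference library was selected (e.g. "cpu", "cuda")
}

func (e estimate) log() {
	// One structured record, named after the library actually in use.
	slog.Info("offload to "+e.library, "layers", e.Layers)
}

func pick(candidates []estimate) estimate {
	// A stand-in for the scheduler's search: evaluate many candidates,
	// log nothing while iterating.
	best := candidates[0]
	for _, c := range candidates[1:] {
		if c.Layers > best.Layers {
			best = c
		}
	}
	return best
}

func main() {
	candidates := []estimate{
		{Layers: 0, library: "cpu"},
		{Layers: 33, library: "cuda"},
	}
	// Only the final choice is logged, just before the server would start.
	pick(candidates).log()
}

Keeping the extra fields unexported, as the diff below does, means they exist purely so log() can be called later from llm/server.go without widening the exported MemoryEstimate surface.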
Daniel Hiltgen 10 months ago
parent
commit
7784ca33ce
2 changed files with 66 additions and 44 deletions
  1. llm/memory.go  +64 -44
  2. llm/server.go  +2 -0

+ 64 - 44
llm/memory.go

@@ -49,6 +49,18 @@ type MemoryEstimate struct {
 
 	// For multi-GPU scenarios, this is the size in bytes per GPU
 	GPUSizes []uint64
+
+	// internal fields for logging purposes
+	inferenceLibrary    string
+	layersRequested     int
+	layersModel         int
+	availableList       []string
+	kv                  uint64
+	allocationsList     []string
+	memoryWeights       uint64
+	memoryLayerOutput   uint64
+	graphFullOffload    uint64
+	graphPartialOffload uint64
 }
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
@@ -252,78 +264,86 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		allocationsList = append(allocationsList, format.HumanBytes2(a))
 	}
 
+	estimate := MemoryEstimate{
+		TotalSize: memoryRequiredTotal,
+		Layers:    0,
+		Graph:     0,
+		VRAMSize:  0,
+		GPUSizes:  []uint64{},
+
+		inferenceLibrary:    gpus[0].Library,
+		layersRequested:     opts.NumGPU,
+		layersModel:         int(ggml.KV().BlockCount()) + 1,
+		availableList:       availableList,
+		kv:                  kv,
+		allocationsList:     allocationsList,
+		memoryWeights:       memoryWeights,
+		memoryLayerOutput:   memoryLayerOutput,
+		graphFullOffload:    graphFullOffload,
+		graphPartialOffload: graphPartialOffload,
+	}
+
+	if gpus[0].Library == "cpu" {
+		return estimate
+	}
+	if layerCount == 0 {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return estimate
+	}
+	estimate.Layers = layerCount
+	estimate.Graph = graphOffload
+	estimate.VRAMSize = memoryRequiredPartial
+	estimate.TotalSize = memoryRequiredTotal
+	estimate.TensorSplit = tensorSplit
+	estimate.GPUSizes = gpuAllocations
+	return estimate
+}
+
+func (m MemoryEstimate) log() {
 	slog.Info(
-		"offload to gpu",
+		"offload to "+m.inferenceLibrary,
 		slog.Group(
 			"layers",
 			// requested number of layers to offload
-			"requested", opts.NumGPU,
+			"requested", m.layersRequested,
 			// The number of layers the model has (including output)
-			"model", int(ggml.KV().BlockCount())+1,
+			"model", m.layersModel,
 			// estimated number of layers that can be offloaded
-			"offload", layerCount,
-			// multi-gpu split for tesnors
-			"split", tensorSplit,
+			"offload", m.Layers,
+			// multi-gpu split for tensors
+			"split", m.TensorSplit,
 		),
 		slog.Group(
 			"memory",
 			// memory available by GPU for offloading
-			"available", availableList,
+			"available", m.availableList,
 			slog.Group(
 				"required",
 				// memory required for full offloading
-				"full", format.HumanBytes2(memoryRequiredTotal),
+				"full", format.HumanBytes2(m.TotalSize),
 				// memory required to offload layers.estimate layers
-				"partial", format.HumanBytes2(memoryRequiredPartial),
+				"partial", format.HumanBytes2(m.VRAMSize),
 				// memory of KV cache
-				"kv", format.HumanBytes2(kv),
+				"kv", format.HumanBytes2(m.kv),
 				// Allocations across the GPUs
-				"allocations", allocationsList,
+				"allocations", m.allocationsList,
 			),
 			slog.Group(
 				"weights",
 				// memory of the weights
-				"total", format.HumanBytes2(memoryWeights),
+				"total", format.HumanBytes2(m.memoryWeights),
 				// memory of repeating layers
-				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
+				"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
 				// memory of non-repeating layers
-				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
+				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
 			),
 			slog.Group(
 				"graph",
 				// memory of graph when fully offloaded
-				"full", format.HumanBytes2(graphFullOffload),
+				"full", format.HumanBytes2(m.graphFullOffload),
 				// memory of graph when not fully offloaded
-				"partial", format.HumanBytes2(graphPartialOffload),
+				"partial", format.HumanBytes2(m.graphPartialOffload),
 			),
 		),
 	)
-	if gpus[0].Library == "cpu" {
-		return MemoryEstimate{
-			Layers:    0,
-			Graph:     0,
-			VRAMSize:  0,
-			TotalSize: memoryRequiredTotal,
-			GPUSizes:  []uint64{},
-		}
-	}
-	if layerCount == 0 {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return MemoryEstimate{
-			Layers:    0,
-			Graph:     0,
-			VRAMSize:  0,
-			TotalSize: memoryRequiredTotal,
-			GPUSizes:  []uint64{},
-		}
-	}
-
-	return MemoryEstimate{
-		Layers:      layerCount,
-		Graph:       graphOffload,
-		VRAMSize:    memoryRequiredPartial,
-		TotalSize:   memoryRequiredTotal,
-		TensorSplit: tensorSplit,
-		GPUSizes:    gpuAllocations,
-	}
 }

+ 2 - 0
llm/server.go

@@ -116,6 +116,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 	}
 
+	estimate.log()
+
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")