Record GPU usage information

This records additional GPU usage information (estimated total model size, total layer count, and GPU count) for eventual inclusion in the UX.
Daniel Hiltgen, 1 year ago
commit bee2f4a3b0
3 changed files with 40 additions and 20 deletions
  1. format/bytes.go  +2 -0
  2. llm/memory.go  +12 -12
  3. llm/server.go  +26 -8

format/bytes.go  +2 -0

@@ -53,6 +53,8 @@ func HumanBytes(b int64) string {
 
 func HumanBytes2(b uint64) string {
 	switch {
+	case b >= GibiByte:
+		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
 	case b >= KibiByte:
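
For reference, a small standalone sketch of how the updated formatter behaves once the GiB case is in place; the constant values below are assumed to be the usual 1024-based ones from format/bytes.go, and humanBytes2 is a local mirror of HumanBytes2, not the package function itself.

package main

import "fmt"

// Assumed values matching the 1024-based constants in format/bytes.go.
const (
	KibiByte = 1024
	MebiByte = KibiByte * 1024
	GibiByte = MebiByte * 1024
)

// humanBytes2 mirrors the updated HumanBytes2: the largest matching unit wins,
// so multi-gigabyte values now render as GiB rather than as thousands of MiB.
func humanBytes2(b uint64) string {
	switch {
	case b >= GibiByte:
		return fmt.Sprintf("%.1f GiB", float64(b)/GibiByte)
	case b >= MebiByte:
		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
	case b >= KibiByte:
		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
	default:
		return fmt.Sprintf("%d B", b)
	}
}

func main() {
	fmt.Println(humanBytes2(6*GibiByte + 512*MebiByte)) // prints "6.5 GiB"
}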

llm/memory.go  +12 -12

@@ -25,7 +25,7 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	// Split up the GPUs by type and try them
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -39,12 +39,9 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }
 
-// Given a model and one or more GPU targets, predict how many layers and bytes we can load
+// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
-	if gpus[0].Library == "cpu" {
-		return 0, 0
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
 	var memoryAvailable uint64
 	for _, info := range gpus {
 		memoryAvailable += info.FreeMemory
@@ -93,11 +90,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
 	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
-	if memoryRequiredPartial > memoryAvailable {
-		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0
-	}
-
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
@@ -181,5 +173,13 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			),
 		),
 	)
-	return layerCount, uint64(memoryRequiredPartial)
+	if gpus[0].Library == "cpu" {
+		return 0, 0, memoryRequiredTotal
+	}
+	if memoryRequiredPartial > memoryAvailable {
+		slog.Debug("insufficient VRAM to load any model layers")
+		return 0, 0, memoryRequiredTotal
+	}
+
+	return layerCount, memoryRequiredPartial, memoryRequiredTotal
 }
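
To illustrate the new contract, here is a simplified, self-contained sketch of the call pattern. estimateGPULayers below is a stand-in for llm.EstimateGPULayers with its GPU/GGML parameters collapsed into plain numbers; the point is that the estimated total model size is now always returned as a third value, even in the CPU-only and insufficient-VRAM cases that previously returned (0, 0) early.

package main

import (
	"fmt"
	"log/slog"
)

// Simplified stand-in for llm.EstimateGPULayers after this change: the total
// model size is always reported, even when no layers can be offloaded.
func estimateGPULayers(library string, freeVRAM, requiredPartial, requiredTotal uint64, layerCount int) (int, uint64, uint64) {
	if library == "cpu" {
		return 0, 0, requiredTotal
	}
	if requiredPartial > freeVRAM {
		slog.Debug("insufficient VRAM to load any model layers")
		return 0, 0, requiredTotal
	}
	return layerCount, requiredPartial, requiredTotal
}

func main() {
	// Callers now unpack three results, as PredictServerFit and NewLlamaServer do in this diff.
	layers, estimatedVRAM, estimatedTotal := estimateGPULayers("cuda", 8<<30, 5<<30, 7<<30, 33)
	fmt.Printf("layers=%d estimatedVRAM=%d estimatedTotal=%d\n", layers, estimatedVRAM, estimatedTotal)
}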

llm/server.go  +26 -8

@@ -49,7 +49,10 @@ type llmServer struct {
 	options api.Options
 
 	// TODO - this should be broken down by GPU
-	estimatedVRAM uint64 // Estimated usage of VRAM by the loaded model
+	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
+	estimatedTotal uint64 // Total size of model
+	totalLayers    uint64
+	gpuCount       int
 
 	sem *semaphore.Weighted
 }
@@ -83,12 +86,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	cpuRunner := ""
 	var estimatedVRAM uint64
+	var estimatedTotal uint64
 	var systemMemory uint64
+	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
 
 		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 
 		cpuRunner = serverForCpu()
+		gpuCount = 0
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			}
 		}
 		var layers int
-		layers, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
+		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
 
 		if gpus[0].Library == "metal" && estimatedVRAM > systemMemory {
 			// disable partial offloading when model is greater than total system memory as this
@@ -133,6 +139,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		} else {
 			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
 			servers = []string{demandLib}
+			if strings.HasPrefix(demandLib, "cpu") {
+				// Omit the GPU flag to silence the warning
+				opts.NumGPU = -1
+			}
 		}
 	}
 
@@ -214,6 +224,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			continue
 		}
 
+		if strings.HasPrefix(servers[i], "cpu") {
+			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
+			gpuCount = 0
+		}
+
 		// Find an available port, retry on each iteration in case the failure was a port conflict race
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
@@ -267,12 +282,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		s := &llmServer{
-			port:          port,
-			cmd:           exec.Command(server, finalParams...),
-			status:        NewStatusWriter(os.Stderr),
-			options:       opts,
-			estimatedVRAM: estimatedVRAM,
-			sem:           semaphore.NewWeighted(int64(numParallel)),
+			port:           port,
+			cmd:            exec.Command(server, finalParams...),
+			status:         NewStatusWriter(os.Stderr),
+			options:        opts,
+			estimatedVRAM:  estimatedVRAM,
+			estimatedTotal: estimatedTotal,
+			sem:            semaphore.NewWeighted(int64(numParallel)),
+			totalLayers:    ggml.KV().BlockCount() + 1,
+			gpuCount:       gpuCount,
 		}
 
 		s.cmd.Env = os.Environ()
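
The commit message points at eventual UX inclusion; the snippet below is a hypothetical illustration of how the newly recorded llmServer fields could be surfaced. The usageSummary helper and its output format are invented for this sketch and are not part of the commit.

package main

import "fmt"

// Simplified mirror of the fields this commit adds to llmServer.
type llmServer struct {
	estimatedVRAM  uint64 // estimated VRAM used by the loaded model
	estimatedTotal uint64 // total size of the model
	totalLayers    uint64 // ggml.KV().BlockCount() + 1
	gpuCount       int    // zeroed when a CPU runner ends up being used
}

// usageSummary is a hypothetical helper, not part of this commit, showing one
// way the recorded values could be rendered for a user-facing status line.
func (s *llmServer) usageSummary() string {
	return fmt.Sprintf("gpus=%d layers=%d vram=%.1f GiB total=%.1f GiB",
		s.gpuCount, s.totalLayers,
		float64(s.estimatedVRAM)/(1<<30), float64(s.estimatedTotal)/(1<<30))
}

func main() {
	s := &llmServer{estimatedVRAM: 5 << 30, estimatedTotal: 7 << 30, totalLayers: 33, gpuCount: 1}
	fmt.Println(s.usageSummary()) // gpus=1 layers=33 vram=5.0 GiB total=7.0 GiB
}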