9 月之前 · 791650ddef
--- a/llm/server.go
+++ b/llm/server.go
@@ -122,6 +122,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	// On linux, over-allocating CPU memory will almost always result in an error
			
 
				+	if runtime.GOOS == "linux" {
			
 
				+		systemMemoryRequired := estimate.TotalSize - estimate.VRAMSize
			
 
				+		if systemMemoryRequired > systemTotalMemory {
			
 
				+			slog.Warn("model request too large for system", "requested", format.HumanBytes2(systemMemoryRequired), "system", format.HumanBytes2(systemTotalMemory))
			
 
				+			return nil, fmt.Errorf("model requires more system memory (%s) than is available (%s)", format.HumanBytes2(systemMemoryRequired), format.HumanBytes2(systemTotalMemory))
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	estimate.log()
			
 
				 
			
 
				 	// Loop through potential servers
			
--- a/server/sched.go
+++ b/server/sched.go
@@ -135,11 +135,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 
				 			}
			
 
				 
			
 
				 			for {
			
 
				-				cpus := s.getCpuFn()
			
 
				-				var systemMem gpu.GpuInfo
			
 
				-				if len(cpus) > 0 {
			
 
				-					systemMem = cpus[0]
			
 
				-				}
			
 
				 				var runnerToExpire *runnerRef
			
 
				 				s.loadedMu.Lock()
			
 
				 				runner := s.loaded[pending.model.ModelPath]
			
@@ -193,38 +188,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 
				 						break
			
 
				 					}
			
 
				 
			
 
				-					estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts)
			
 
				-					maxSize := systemMem.FreeMemory
			
 
				-
			
 
				-					// Add available GPU memory to the total pool
			
 
				-					// macOS hardware has unified memory so don't double count
			
 
				-					if runtime.GOOS != "darwin" {
			
 
				-						for _, gpu := range gpus {
			
 
				-							if gpu.Library == "cpu" {
			
 
				-								continue
			
 
				-							}
			
 
				-							if loadedCount == 0 {
			
 
				-								// If no other models are loaded, set the limit based on what's available
			
 
				-								maxSize += gpu.FreeMemory
			
 
				-							} else {
			
 
				-								// Other models could be unloaded, favor total memory for limit
			
 
				-								maxSize += gpu.TotalMemory
			
 
				-							}
			
 
				-						}
			
 
				-					}
			
 
				-
			
 
				-					// Block attempting to load a model larger than system memory + GPU memory
			
 
				-					if estimate.TotalSize > maxSize {
			
 
				-						slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize))
			
 
				-
			
 
				-						// Linux will crash if over-allocating memory - return an error to the user.
			
 
				-						// TODO (jmorganca): add reasonable upper limits for darwin and windows as well
			
 
				-						if runtime.GOOS == "linux" {
			
 
				-							pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize))
			
 
				-							break
			
 
				-						}
			
 
				-					}
			
 
				-
			
 
				 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
			
 
				 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
			
 
				 						// simplifying assumption of defaultParallel when in CPU mode