Refine CPU load behavior with system memory visibility

Daniel Hiltgen, 11 months ago
parent commit fc37c192ae
7 changed files with 211 additions and 98 deletions:

  1. gpu/gpu.go            +83  -13
  2. gpu/gpu_darwin.go     +11  -0
  3. gpu/gpu_info_cpu.c    +1   -5
  4. gpu/gpu_info_nvml.c   +2   -10
  5. llm/server.go         +24  -13
  6. server/sched.go       +56  -31
  7. server/sched_test.go  +34  -26

+ 83 - 13
gpu/gpu.go

@@ -11,6 +11,8 @@ package gpu
 */
 import "C"
 import (
+	"bufio"
+	"bytes"
 	"fmt"
 	"log/slog"
 	"os"
@@ -246,6 +248,17 @@ func initOneAPIHandles() *oneapiHandles {
 	return oHandles
 }
 
+func GetCPUInfo() GpuInfoList {
+	gpuMutex.Lock()
+	if !bootstrapped {
+		gpuMutex.Unlock()
+		GetGPUInfo()
+	} else {
+		gpuMutex.Unlock()
+	}
+	return GpuInfoList{cpus[0].GpuInfo}
+}
+
 func GetGPUInfo() GpuInfoList {
 	// TODO - consider exploring lspci (and equivalent on windows) to check for
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
@@ -279,22 +292,19 @@ func GetGPUInfo() GpuInfoList {
 		needRefresh = false
 		cpuCapability = getCPUCapability()
 		var memInfo C.mem_info_t
-		C.cpu_check_ram(&memInfo)
-		if memInfo.err != nil {
-			slog.Info("error looking up CPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			return []GpuInfo{}
+
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
 		}
-		cpuInfo := CPUInfo{
+		cpus = []CPUInfo{CPUInfo{
 			GpuInfo: GpuInfo{
+				memInfo: mem,
 				Library: "cpu",
 				Variant: cpuCapability.ToVariant(),
+				ID:      "0",
 			},
-		}
-		cpuInfo.TotalMemory = uint64(memInfo.total)
-		cpuInfo.FreeMemory = uint64(memInfo.free)
-		cpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		cpus = []CPUInfo{cpuInfo}
+		}}
 
 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
@@ -394,7 +404,25 @@ func GetGPUInfo() GpuInfoList {
 
 	// Refresh free memory usage
 	if needRefresh {
-		// TODO - CPU system memory tracking/refresh
+		mem, err := GetCPUMem()
+		if err != nil {
+			slog.Warn("error looking up system memory", "error", err)
+		} else {
+			slog.Debug("updating system memory data",
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(cpus[0].TotalMemory),
+					"free", format.HumanBytes2(cpus[0].FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(mem.TotalMemory),
+					"free", format.HumanBytes2(mem.FreeMemory),
+				),
+			)
+			cpus[0].FreeMemory = mem.FreeMemory
+		}
+
 		var memInfo C.mem_info_t
 		if cHandles == nil && len(cudaGPUs) > 0 {
 			cHandles = initCudaHandles()
@@ -455,7 +483,7 @@ func GetGPUInfo() GpuInfoList {
 			oneapiGPUs[i].FreeMemory = uint64(memInfo.free)
 		}
 
-		err := RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
+		err = RocmGPUInfoList(rocmGPUs).RefreshFreeMemory()
 		if err != nil {
 			slog.Debug("problem refreshing ROCm free memory", "error", err)
 		}
@@ -478,6 +506,9 @@ func GetGPUInfo() GpuInfoList {
 }
 
 func GetCPUMem() (memInfo, error) {
+	if runtime.GOOS == "linux" {
+		return GetLinuxMemInfo()
+	}
 	var ret memInfo
 	var info C.mem_info_t
 	C.cpu_check_ram(&info)
@@ -651,3 +682,42 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 		return "", ""
 	}
 	}
 }
 }
+
+func GetLinuxMemInfo() (memInfo, error) {
+	var mem memInfo
+	var total, available, free, buffers, cached uint64
+	f, err := os.Open("/proc/meminfo")
+	if err != nil {
+		return mem, err
+	}
+	defer f.Close()
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		switch {
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
+		case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)):
+			_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
+		case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)):
+			_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
+		case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)):
+			_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
+		default:
+			continue
+		}
+		if err != nil {
+			return mem, err
+		}
+
+		if total > 0 && available > 0 {
+			mem.TotalMemory = total * 1024
+			mem.FreeMemory = available * 1024
+			return mem, nil
+		}
+	}
+	mem.TotalMemory = total * 1024
+	mem.FreeMemory = (free + buffers + cached) * 1024
+	return mem, nil
+}
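
For reference, the GetLinuxMemInfo parser added above leans on two fmt.Sscanf behaviors: %d discards the run of spaces after the colon, and scanning stops before the trailing "kB" unit, so typical /proc/meminfo entries parse directly. A minimal standalone sketch of that parsing step (illustrative only, not part of this commit):

	package main

	import "fmt"

	func main() {
		var total uint64
		// A typical /proc/meminfo line; values are reported in kB.
		line := "MemTotal:       32796552 kB"
		if _, err := fmt.Sscanf(line, "MemTotal:%d", &total); err != nil {
			panic(err)
		}
		fmt.Println(total * 1024) // bytes, matching what GetLinuxMemInfo returns
	}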

+ 11 - 0
gpu/gpu_darwin.go

@@ -42,6 +42,17 @@ func GetGPUInfo() GpuInfoList {
 	return []GpuInfo{info}
 }
 
+func GetCPUInfo() GpuInfoList {
+	mem, _ := GetCPUMem()
+	return []GpuInfo{
+		{
+			Library: "cpu",
+			Variant: GetCPUVariant(),
+			memInfo: mem,
+		},
+	}
+}
+
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),

+ 1 - 5
gpu/gpu_info_cpu.c

@@ -35,11 +35,7 @@ void cpu_check_ram(mem_info_t *resp) {
 }
 
 #elif __APPLE__
-// TODO consider an Apple implementation that does something useful
-// mem_info_t cpu_check_ram() {
-//   mem_info_t resp = {0, 0, NULL};
-//   return resp;
-// }
+// Unused - see gpu_darwin.go
 #else
 #error "Unsupported platform"
 #endif

+ 2 - 10
gpu/gpu_info_nvml.c

@@ -11,8 +11,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   char buf[buflen + 1];
   int i;
 
-  LOG(1, "XXX starting nvml_init %s\n", nvml_lib_path);
-
   struct lookup {
     char *s;
     void **p;
@@ -37,13 +35,11 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   }
 
   // TODO once we've squashed the remaining corner cases remove this log
-//   LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
+  // LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
   
-    LOG(1, "XXX wiring functions nvml_init\n");
-
   for (i = 0; l[i].s != NULL; i++) {
     // TODO once we've squashed the remaining corner cases remove this log
-    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+    // LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
 
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
     if (!l[i].p) {
@@ -58,7 +54,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
       return;
     }
   }
-    LOG(1, "XXX calling init_v2\n");
 
 
   ret = (*resp->ch.nvmlInit_v2)();
   ret = (*resp->ch.nvmlInit_v2)();
   if (ret != NVML_SUCCESS) {
   if (ret != NVML_SUCCESS) {
@@ -69,8 +64,6 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
     resp->err = strdup(buf);
     return;
   }
-      LOG(1, "XXX nvml_init done\n");
-
 }
 }
 
 
 
 
@@ -78,7 +71,6 @@ void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *tot
     nvmlDevice_t device;
     nvmlMemory_t memInfo = {0};
     nvmlReturn_t ret;
-    LOG(1, "XXX in nvml_get_free\n");
     ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
     if (ret != NVML_SUCCESS) {
         LOG(1, "unable to get device handle %d: %d", device_id, ret);

+ 24 - 13
llm/server.go

@@ -37,8 +37,9 @@ type LlamaServer interface {
 	Tokenize(ctx context.Context, content string) ([]int, error)
 	Detokenize(ctx context.Context, tokens []int) (string, error)
 	Close() error
-	EstimatedVRAM() uint64
+	EstimatedVRAM() uint64 // Total VRAM across all GPUs
 	EstimatedTotal() uint64
+	EstimagedVRAMByGPU(gpuID string) uint64
 }
 
 // llmServer is an instance of the llama.cpp server
@@ -49,10 +50,11 @@ type llmServer struct {
 	status  *StatusWriter
 	options api.Options
 
-	estimate     MemoryEstimate
-	totalLayers  uint64
-	gpuCount     int
-	loadDuration time.Duration // Record how long it took the model to load
+	estimate    MemoryEstimate
+	totalLayers uint64
+	// gpuCount     int
+	gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+	loadDuration time.Duration   // Record how long it took the model to load
 	loadProgress float32
 
 	sem *semaphore.Weighted
@@ -80,12 +82,13 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var cpuRunner string
 	var estimate MemoryEstimate
 	var systemMemory uint64
-	gpuCount := len(gpus)
-	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
-		// TODO evaluate system memory to see if we should block the load, or force an unload of another CPU runner
 
+	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
+	if opts.NumGPU == 0 {
+		gpus = gpu.GetCPUInfo()
+	}
+	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		cpuRunner = serverForCpu()
-		gpuCount = 0
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
@@ -107,7 +110,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = serverForCpu()
 			cpuRunner = serverForCpu()
-			gpuCount = 0
+			gpus = gpu.GetCPUInfo()
 		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 			opts.NumGPU = estimate.Layers
 		}
@@ -246,8 +249,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		if strings.HasPrefix(servers[i], "cpu") {
-			// TODO if we tried a gpu runner first, and it failed, record the error and bubble that back up
-			gpuCount = 0
+			gpus = gpu.GetCPUInfo()
 		}
 
 		// Find an availableServers  port, retry on each iteration in case the failure was a port conflict race
@@ -310,7 +312,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			estimate:    estimate,
 			sem:         semaphore.NewWeighted(int64(numParallel)),
 			totalLayers: ggml.KV().BlockCount() + 1,
-			gpuCount:    gpuCount,
+			gpus:        gpus,
 			done:        make(chan error, 1),
 		}
 
@@ -1014,6 +1016,15 @@ func (s *llmServer) EstimatedTotal() uint64 {
 	return s.estimate.TotalSize
 }
 
+func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 {
+	for i, gpu := range s.gpus {
+		if gpu.ID == gpuID {
+			return s.estimate.GPUSizes[i]
+		}
+	}
+	return 0
+}
+
 func parseDurationMs(ms float64) time.Duration {
 	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
 	if err != nil {
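
The new EstimagedVRAMByGPU accessor lets callers attribute a runner's memory estimate to specific devices instead of assuming a uniform spread across GPUs. A rough sketch of the intended use (illustrative; server/sched.go below does this inside updateFreeSpace, and `runners` here is a hypothetical slice of loaded runners):

	// Sum what every loaded runner is predicted to occupy on each device,
	// then recompute that device's free memory from its total (ignoring
	// the underflow guard a real implementation would want).
	for i, g := range allGpus {
		var predicted uint64
		for _, r := range runners {
			predicted += r.llama.EstimagedVRAMByGPU(g.ID)
		}
		allGpus[i].FreeMemory = allGpus[i].TotalMemory - predicted
	}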

+ 56 - 31
server/sched.go

@@ -7,7 +7,6 @@ import (
 	"log/slog"
 	"log/slog"
 	"reflect"
 	"reflect"
 	"runtime"
 	"runtime"
-	"slices"
 	"sort"
 	"sort"
 	"strings"
 	"strings"
 	"sync"
 	"sync"
@@ -41,6 +40,7 @@ type Scheduler struct {
 	loadFn      func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
 	newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
 	getGpuFn    func() gpu.GpuInfoList
+	getCpuFn    func() gpu.GpuInfoList
 }
 
 var ErrMaxQueue = fmt.Errorf("server busy, please try again.  maximum pending requests exceeded")
@@ -54,6 +54,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
 		getGpuFn:      gpu.GetGPUInfo,
+		getCpuFn:      gpu.GetCPUInfo,
 	}
 	sched.loadFn = sched.load
 	return sched
@@ -131,7 +132,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				} else {
 					// Either no models are loaded or below envconfig.MaxRunners
 					// Get a refreshed GPU list
-					gpus := s.getGpuFn()
+					var gpus gpu.GpuInfoList
+					if pending.opts.NumGPU == 0 {
+						gpus = s.getCpuFn()
+					} else {
+						gpus = s.getGpuFn()
+					}
 
 					// Load model for fitting
 					ggml, err := llm.LoadModel(pending.model.ModelPath)
@@ -140,16 +146,22 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 
-					// If we're CPU only mode, just limit by envconfig.MaxRunners above
-					// TODO handle system memory exhaustion
-					if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
-						slog.Debug("cpu mode with existing models, loading")
-						s.loadFn(pending, ggml, gpus)
-						break
-					}
-
-					// No models loaded. Load the model but prefer the best fit.
-					if loadedCount == 0 {
+					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
+					if len(gpus) == 1 && gpus[0].Library == "cpu" {
+						if loadedCount == 0 {
+							slog.Debug("cpu mode with first model, loading")
+							s.loadFn(pending, ggml, gpus)
+							break
+						}
+						runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
+						if runnerToExpire == nil {
+							slog.Debug("cpu mode with available system memory or first model, loading")
+							s.loadFn(pending, ggml, gpus)
+							break
+						}
+						// else we need to expire a runner
+					} else if loadedCount == 0 {
+						// No models loaded. Load the model but prefer the best fit.
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
 						g := pickBestFitGPUs(pending, ggml, gpus)
 						g := pickBestFitGPUs(pending, ggml, gpus)
 						if g != nil {
 						if g != nil {
@@ -159,16 +171,18 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 
-					// More than one loaded model, so we have to see if the new one fits
-					// Update free memory from currently loaded models
-					s.updateFreeSpace(gpus)
-					gpus = pickBestFitGPUs(pending, ggml, gpus)
-					if gpus != nil {
-						slog.Debug("new model fits with existing models, loading")
-						s.loadFn(pending, ggml, gpus)
-						break
+					if runnerToExpire == nil {
+						// More than one loaded model, so we have to see if the new one fits
+						// Update free memory from currently loaded models
+						s.updateFreeSpace(gpus)
+						gpus = pickBestFitGPUs(pending, ggml, gpus)
+						if gpus != nil {
+							slog.Debug("new model fits with existing models, loading")
+							s.loadFn(pending, ggml, gpus)
+							break
+						}
+						runnerToExpire = s.findRunnerToUnload()
 					}
-					runnerToExpire = s.findRunnerToUnload()
 				}
 
 				if runnerToExpire == nil {
@@ -368,17 +382,11 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 	s.loadedMu.Lock()
 	for _, r := range s.loaded {
 		r.refMu.Lock()
-		gpuIDs := make([]string, 0, len(r.gpus))
 		if r.llama != nil {
-			// TODO this should be broken down by GPU instead of assuming uniform spread
-			estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
-			for _, gpu := range r.gpus {
-				gpuIDs = append(gpuIDs, gpu.ID)
-			}
 			for _, gpu := range allGpus {
-				if slices.Contains(gpuIDs, gpu.ID) {
-					predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU
-				}
+				// if slices.Contains(gpuIDs, gpu.ID) {
+				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID)
+				// }
 			}
 		} else {
 			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -489,7 +497,8 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 
 	// CPU or Metal don't need checking, so no waiting required
 	// windows can page VRAM, only cuda currently can report accurate used vram usage
-	if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
+	if len(runner.gpus) == 0 ||
+		(len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
 		(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
 		(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
 		finished <- struct{}{}
 		finished <- struct{}{}
 		return finished
 		return finished
@@ -624,3 +633,19 @@ func (s *Scheduler) unloadAllRunners() {
 		}
 	}
 }
+
+// If other runners are loaded, make sure the pending request will fit in system memory
+// If not, pick a runner to unload, else return nil and the request can be loaded
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+	slog.Debug("evaluating if CPU model load will fit in available system memory")
+	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+	if estimate.TotalSize <= gpus[0].FreeMemory {
+		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
+		return nil
+	}
+
+	// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
+
+	return s.findRunnerToUnload()
+
+}

+ 34 - 26
server/sched_test.go

@@ -60,7 +60,7 @@ func TestLoad(t *testing.T) {
 	err := <-req.errCh
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
-	server := &mockLlm{estimatedVRAM: 10}
+	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
 	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
 		return server, nil
 	}
@@ -146,7 +146,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		successCh:       make(chan *runnerRef, 1),
 		errCh:           make(chan error, 1),
 	}
-	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
+	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
 	return scenario
 }
 
@@ -182,6 +182,12 @@ func TestRequests(t *testing.T) {
 		g.FreeMemory = 12 * format.GigaByte
 		return []gpu.GpuInfo{g}
 	}
+	s.getCpuFn = func() gpu.GpuInfoList {
+		g := gpu.GpuInfo{Library: "cpu"}
+		g.TotalMemory = 32 * format.GigaByte
+		g.FreeMemory = 26 * format.GigaByte
+		return []gpu.GpuInfo{g}
+	}
 	s.newServerFn = scenario1a.newServer
 	slog.Info("scenario1a")
 	s.pendingReqCh <- scenario1a.req
@@ -420,7 +426,7 @@ func TestUseLoadedRunner(t *testing.T) {
 		sessionDuration: 2,
 	}
 	finished := make(chan *LlmRequest)
-	llm1 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1, sessionDuration: 1}
 	req.useLoadedRunner(r1, finished)
 	require.Equal(t, uint(1), r1.refCount)
@@ -453,8 +459,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	gpus[0].FreeMemory = 900
 	gpus[1].TotalMemory = 2000
 	gpus[1].FreeMemory = 1900
-	llm1 := &mockLlm{estimatedVRAM: 100}
-	llm2 := &mockLlm{estimatedVRAM: 200}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
+	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
 	r1 := &runnerRef{llama: llm1, gpus: gpus}
 	r2 := &runnerRef{llama: llm2, gpus: gpus}
 
@@ -465,8 +471,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	s.loadedMu.Unlock()
 
 	s.updateFreeSpace(gpus)
-	require.Equal(t, uint64(850), gpus[0].FreeMemory)
-	require.Equal(t, uint64(1850), gpus[1].FreeMemory)
+	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
+	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
 }
 
 func TestFindRunnerToUnload(t *testing.T) {
@@ -493,7 +499,7 @@ func TestNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	llm := &mockLlm{}
+	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	do := api.DefaultOptions()
 	runner := &runnerRef{
 		model:   &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
@@ -536,8 +542,8 @@ func TestUnloadAllRunners(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	llm1 := &mockLlm{}
-	llm2 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	s := InitScheduler(ctx)
 	s.unloadAllRunners()
 
@@ -555,7 +561,7 @@ func TestUnloadAllRunners(t *testing.T) {
 }
 
 func TestUnload(t *testing.T) {
-	llm1 := &mockLlm{}
+	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1}
 	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
 	r1.unload()
@@ -565,19 +571,20 @@ func TestUnload(t *testing.T) {
 }
 
 type mockLlm struct {
-	pingResp          error
-	waitResp          error
-	completionResp    error
-	embeddingResp     []float64
-	embeddingRespErr  error
-	tokenizeResp      []int
-	tokenizeRespErr   error
-	detokenizeResp    string
-	detonekizeRespErr error
-	closeResp         error
-	closeCalled       bool
-	estimatedVRAM     uint64
-	estimatedTotal    uint64
+	pingResp           error
+	waitResp           error
+	completionResp     error
+	embeddingResp      []float64
+	embeddingRespErr   error
+	tokenizeResp       []int
+	tokenizeRespErr    error
+	detokenizeResp     string
+	detonekizeRespErr  error
+	closeResp          error
+	closeCalled        bool
+	estimatedVRAM      uint64
+	estimatedTotal     uint64
+	estimatedVRAMByGPU map[string]uint64
 }
 
 func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
@@ -598,5 +605,6 @@ func (s *mockLlm) Close() error {
 	s.closeCalled = true
 	return s.closeResp
 }
-func (s *mockLlm) EstimatedVRAM() uint64  { return s.estimatedVRAM }
-func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }
+func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
+func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
+func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
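
For the record, the updated TestUpdateFreeSpace expectations follow directly from this per-GPU accounting (assuming the two test GPUs carry IDs "1" and "2"): llm1 claims 50 on each device, llm2 claims 125 and 75 respectively, and free memory is recomputed from each device's total:

	// GPU "1": 1000 total - (50 + 125) predicted = 825 free
	// GPU "2": 2000 total - (50 + 75) predicted = 1875 free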