
review comments and coverage

Daniel Hiltgen 11 months ago
parent
commit
6f351bf586
18 changed files with 374 additions and 455 deletions
  1. gpu/amd_linux.go (+9, -25)
  2. gpu/amd_windows.go (+2, -1)
  3. gpu/cpu_common.go (+2, -6)
  4. gpu/gpu.go (+25, -166)
  5. gpu/gpu_darwin.go (+2, -2)
  6. gpu/gpu_info_cpu.c (+0, -41)
  7. gpu/gpu_info_oneapi.c (+40, -60)
  8. gpu/gpu_info_oneapi.h (+24, -43)
  9. gpu/gpu_linux.go (+89, -0)
  10. gpu/gpu_windows.go (+55, -0)
  11. gpu/types.go (+11, -22)
  12. integration/context_test.go (+2, -1)
  13. llm/memory.go (+24, -41)
  14. llm/memory_test.go (+39, -28)
  15. llm/payload.go (+8, -8)
  16. llm/server.go (+2, -2)
  17. server/sched.go (+3, -6)
  18. server/sched_test.go (+37, -3)

+ 9 - 25
gpu/amd_linux.go

@@ -178,7 +178,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		// Shouldn't happen, but just in case...
 		if gpuID < 0 {
 			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			return []RocmGPUInfo{}
+			return nil
 		}
 
 		if int(major) < RocmComputeMin {
@@ -205,22 +205,17 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			matched := true
 			for _, m := range mapping {
 				if m.id == 0 {
+					// Null ID means it didn't populate, so we can't use it to match
 					continue
 				}
 				filename := filepath.Join(devDir, m.filename)
-				fp, err := os.Open(filename)
-				if err != nil {
-					slog.Debug("failed to open sysfs node", "file", filename, "error", err)
-					matched = false
-					break
-				}
-				defer fp.Close()
-				buf, err := io.ReadAll(fp)
+				buf, err := os.ReadFile(filename)
 				if err != nil {
 					slog.Debug("failed to read sysfs node", "file", filename, "error", err)
 					matched = false
 					break
 				}
+				// values here are in hex; strip off the leading 0x and parse so we can compare against the numeric (decimal) values amdgpu reports
 				cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
 				if err != nil {
 					slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
@@ -239,13 +234,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			// Found the matching DRM directory
 			slog.Debug("matched", "amdgpu", match, "drm", devDir)
 			totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
-			totalFp, err := os.Open(totalFile)
-			if err != nil {
-				slog.Debug("failed to open sysfs node", "file", totalFile, "error", err)
-				break
-			}
-			defer totalFp.Close()
-			buf, err := io.ReadAll(totalFp)
+			buf, err := os.ReadFile(totalFile)
 			if err != nil {
 				slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
 				break
@@ -284,7 +273,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 					TotalMemory: totalMemory,
 					FreeMemory:  (totalMemory - usedMemory),
 				},
-				ID:            fmt.Sprintf("%d", gpuID),
+				ID:            strconv.Itoa(gpuID),
 				Name:          name,
 				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
 				MinimumMemory: rocmMinimumMemory,
@@ -315,7 +304,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			libDir, err = AMDValidateLibDir()
 			if err != nil {
 				slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-				return []RocmGPUInfo{}
+				return nil
 			}
 		}
 		gpuInfo.DependencyPath = libDir
@@ -326,7 +315,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 				supported, err = GetSupportedGFX(libDir)
 				if err != nil {
 					slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-					return []RocmGPUInfo{}
+					return nil
 				}
 				slog.Debug("rocm supported GPUs", "types", supported)
 			}
@@ -434,12 +423,7 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
 }
 
 func getFreeMemory(usedFile string) (uint64, error) {
-	usedFp, err := os.Open(usedFile)
-	if err != nil {
-		return 0, fmt.Errorf("failed to open sysfs node %s %w", usedFile, err)
-	}
-	defer usedFp.Close()
-	buf, err := io.ReadAll(usedFp)
+	buf, err := os.ReadFile(usedFile)
 	if err != nil {
 		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
 	}
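
Note on the sysfs matching above: the DRM comparison reads hex-encoded IDs (device, vendor, etc.) and checks them against the decimal values amdgpu reports. A minimal standalone sketch of that parse step, with an illustrative value rather than one from a real sysfs node:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseSysfsHex mirrors the parse used in AMDGetGPUInfo: sysfs nodes such
// as /sys/class/drm/card0/device/device hold values like "0x744c\n".
func parseSysfsHex(raw string) (uint64, error) {
	return strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(raw), "0x"), 16, 64)
}

func main() {
	v, err := parseSysfsHex("0x744c\n")
	if err != nil {
		panic(err)
	}
	fmt.Println(v) // 29772, directly comparable to amdgpu's decimal IDs
}
```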

+ 2 - 1
gpu/amd_windows.go

@@ -7,6 +7,7 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
+	"strconv"
 	"strings"
 
 	"github.com/ollama/ollama/format"
@@ -124,7 +125,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 					TotalMemory: totalMemory,
 					FreeMemory:  freeMemory,
 				},
-				ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
+				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
 				DependencyPath: libDir,
 				MinimumMemory:  rocmMinimumMemory,
 				Name:           name,

+ 2 - 6
gpu/cpu_common.go

@@ -4,11 +4,7 @@ import (
 	"golang.org/x/sys/cpu"
 )
 
-func GetCPUVariant() string {
-	return getCPUCapability().ToVariant()
-}
-
-func getCPUCapability() CPUCapability {
+func GetCPUCapability() CPUCapability {
 	if cpu.X86.HasAVX2 {
 		return CPUCapabilityAVX2
 	}
@@ -16,5 +12,5 @@ func getCPUCapability() CPUCapability {
 		return CPUCapabilityAVX
 	}
 	// else LCD
-	return CPUCapabilityBase
+	return CPUCapabilityNone
 }
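
The rename to an exported GetCPUCapability pairs with the ordered CPUCapability enum in types.go: minimum-requirement checks become plain numeric comparisons instead of string matching. A tiny sketch with the types copied locally for illustration:

```go
package main

import "fmt"

// Copied shape of gpu.CPUCapability: the iota ordering encodes "strength",
// so < and >= express whether a CPU meets the runner's minimum.
type CPUCapability uint32

const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX
	CPUCapabilityAVX2
)

var GPURunnerCPUCapability = CPUCapabilityAVX

func main() {
	detected := CPUCapabilityNone // pretend detection found no AVX
	fmt.Println(detected >= GPURunnerCPUCapability) // false: fall back to CPU-only
}
```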

+ 25 - 166
gpu/gpu.go

@@ -11,8 +11,6 @@ package gpu
 */
 import "C"
 import (
-	"bufio"
-	"bytes"
 	"fmt"
 	"log/slog"
 	"os"
@@ -66,54 +64,6 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512M is what they typically report, so anything less than 1G must be iGPU
 
-var CudartLinuxGlobs = []string{
-	"/usr/local/cuda/lib64/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
-	"/usr/lib/wsl/lib/libcudart.so*",
-	"/usr/lib/wsl/drivers/*/libcudart.so*",
-	"/opt/cuda/lib64/libcudart.so*",
-	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
-	"/usr/local/cuda/lib*/libcudart.so*",
-	"/usr/lib*/libcudart.so*",
-	"/usr/local/lib*/libcudart.so*",
-}
-
-var CudartWindowsGlobs = []string{
-	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
-}
-
-var NvmlWindowsGlobs = []string{
-	"c:\\Windows\\System32\\nvml.dll",
-}
-
-var NvcudaLinuxGlobs = []string{
-	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
-	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
-	"/usr/lib/*-linux-gnu/libcuda.so*",
-	"/usr/lib/wsl/lib/libcuda.so*",
-	"/usr/lib/wsl/drivers/*/libcuda.so*",
-	"/opt/cuda/lib*/libcuda.so*",
-	"/usr/local/cuda/lib*/libcuda.so*",
-	"/usr/lib*/libcuda.so*",
-	"/usr/local/lib*/libcuda.so*",
-}
-
-var NvcudaWindowsGlobs = []string{
-	"c:\\windows\\system*\\nvcuda.dll",
-}
-
-var OneapiWindowsGlobs = []string{
-	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
-}
-
-var OneapiLinuxGlobs = []string{
-	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
-	"/usr/lib*/libze_intel_gpu.so*",
-}
-
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
@@ -139,47 +89,24 @@ func initCudaHandles() *cudaHandles {
 	}
 
 	slog.Debug("searching for GPU discovery libraries for NVIDIA")
-	var cudartMgmtName string
 	var cudartMgmtPatterns []string
-	var nvcudaMgmtName string
-	var nvcudaMgmtPatterns []string
-	var nvmlMgmtName string
-	var nvmlMgmtPatterns []string
-
-	tmpDir, _ := PayloadsDir()
-	switch runtime.GOOS {
-	case "windows":
-		cudartMgmtName = "cudart64_*.dll"
-		localAppData := os.Getenv("LOCALAPPDATA")
-		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
-		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
-		// Aligned with driver, we can't carry as payloads
-		nvcudaMgmtName = "nvcuda.dll"
-		nvcudaMgmtPatterns = NvcudaWindowsGlobs
-
-		// Use nvml to refresh free memory on windows only
-		nvmlMgmtName = "nvml.dll"
-		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
-		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
 
-	case "linux":
-		cudartMgmtName = "libcudart.so*"
-		if tmpDir != "" {
-			// TODO - add "payloads" for subprocess
-			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
-		}
-		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
-		// Aligned with driver, we can't carry as payloads
-		nvcudaMgmtName = "libcuda.so*"
-		nvcudaMgmtPatterns = NvcudaLinuxGlobs
+	// Aligned with driver, we can't carry as payloads
+	nvcudaMgmtPatterns := NvcudaGlobs
 
-		// nvml omitted on linux
-	default:
-		return cHandles
+	if runtime.GOOS == "windows" {
+		localAppData := os.Getenv("LOCALAPPDATA")
+		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
+	}
+	tmpDir, _ := PayloadsDir()
+	if tmpDir != "" {
+		// TODO - add "payloads" for subprocess
+		cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
 	}
+	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
 
-	if len(nvmlMgmtPatterns) > 0 {
-		nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+	if len(NvmlGlobs) > 0 {
+		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
 		if len(nvmlLibPaths) > 0 {
 			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
 			if nvml != nil {
@@ -190,7 +117,7 @@ func initCudaHandles() *cudaHandles {
 		}
 	}
 
-	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
+	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
 		if nvcuda != nil {
@@ -202,7 +129,7 @@ func initCudaHandles() *cudaHandles {
 		}
 	}
 
-	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
 		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
@@ -220,8 +147,6 @@ func initCudaHandles() *cudaHandles {
 // Note: gpuMutex must already be held
 func initOneAPIHandles() *oneapiHandles {
 	oHandles := &oneapiHandles{}
-	var oneapiMgmtName string
-	var oneapiMgmtPatterns []string
 
 	// Short Circuit if we already know which library to use
 	if oneapiLibPath != "" {
@@ -229,18 +154,7 @@ func initOneAPIHandles() *oneapiHandles {
 		return oHandles
 	}
 
-	switch runtime.GOOS {
-	case "windows":
-		oneapiMgmtName = "ze_intel_gpu64.dll"
-		oneapiMgmtPatterns = OneapiWindowsGlobs
-	case "linux":
-		oneapiMgmtName = "libze_intel_gpu.so"
-		oneapiMgmtPatterns = OneapiLinuxGlobs
-	default:
-		return oHandles
-	}
-
-	oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
+	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
 	if len(oneapiLibPaths) > 0 {
 		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
 	}
@@ -290,7 +204,7 @@ func GetGPUInfo() GpuInfoList {
 	if !bootstrapped {
 		slog.Debug("Detecting GPUs")
 		needRefresh = false
-		cpuCapability = getCPUCapability()
+		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t
 
 		mem, err := GetCPUMem()
@@ -301,14 +215,14 @@ func GetGPUInfo() GpuInfoList {
 			GpuInfo: GpuInfo{
 				memInfo: mem,
 				Library: "cpu",
-				Variant: cpuCapability.ToVariant(),
+				Variant: cpuCapability,
 				ID:      "0",
 			},
 		}}
 
 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
-			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability.ToString(), "detected", cpuCapability.ToString())
+			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
 			bootstrapped = true
 			// No need to do any GPU discovery, since we can't run on them
 			return GpuInfoList{cpus[0].GpuInfo}
@@ -357,8 +271,8 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.MinimumMemory = cudaMinimumMemory
 				gpuInfo.DependencyPath = depPath
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				gpuInfo.DriverMajor = int(driverMajor)
-				gpuInfo.DriverMinor = int(driverMinor)
+				gpuInfo.DriverMajor = driverMajor
+				gpuInfo.DriverMinor = driverMinor
 
 				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
 				cudaGPUs = append(cudaGPUs, gpuInfo)
@@ -374,16 +288,16 @@ func GetGPUInfo() GpuInfoList {
 				continue
 			}
 			devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
-			for i := 0; i < int(devCount); i++ {
+			for i := range devCount {
 				gpuInfo := OneapiGPUInfo{
 					GpuInfo: GpuInfo{
 						Library: "oneapi",
 					},
 					driverIndex: d,
-					gpuIndex:    i,
+					gpuIndex:    int(i),
 				}
 				// TODO - split bootstrapping from updating free memory
-				C.oneapi_check_vram(*oHandles.oneapi, C.int(d), C.int(i), &memInfo)
+				C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
 				// TODO - convert this to MinimumMemory based on testing...
 				var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
 				memInfo.free = C.uint64_t(totalFreeMem)
@@ -505,22 +419,6 @@ func GetGPUInfo() GpuInfoList {
 	return resp
 }
 
-func GetCPUMem() (memInfo, error) {
-	if runtime.GOOS == "linux" {
-		return GetLinuxMemInfo()
-	}
-	var ret memInfo
-	var info C.mem_info_t
-	C.cpu_check_ram(&info)
-	if info.err != nil {
-		defer C.free(unsafe.Pointer(info.err))
-		return ret, fmt.Errorf(C.GoString(info.err))
-	}
-	ret.FreeMemory = uint64(info.free)
-	ret.TotalMemory = uint64(info.total)
-	return ret, nil
-}
-
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
@@ -646,7 +544,7 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			for i := 0; i < int(resp.oh.num_drivers); i++ {
+			for i := range resp.oh.num_drivers {
 				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
 			}
 			return num_devices, &resp.oh, libPath
@@ -682,42 +580,3 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
-
-func GetLinuxMemInfo() (memInfo, error) {
-	var mem memInfo
-	var total, available, free, buffers, cached uint64
-	f, err := os.Open("/proc/meminfo")
-	if err != nil {
-		return mem, err
-	}
-	defer f.Close()
-	s := bufio.NewScanner(f)
-	for s.Scan() {
-		switch {
-		case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)):
-			_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
-		case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
-			_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
-		case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)):
-			_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
-		case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)):
-			_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
-		case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)):
-			_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
-		default:
-			continue
-		}
-		if err != nil {
-			return mem, err
-		}
-
-		if total > 0 && available > 0 {
-			mem.TotalMemory = total * 1024
-			mem.FreeMemory = available * 1024
-			return mem, nil
-		}
-	}
-	mem.TotalMemory = total * 1024
-	mem.FreeMemory = (free + buffers + cached) * 1024
-	return mem, nil
-}
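
With the OS-specific globs and management-library names moved into gpu_linux.go and gpu_windows.go (added below), the runtime.GOOS switches in initCudaHandles and initOneAPIHandles collapse: the Go toolchain picks the right definitions at build time from the _linux/_windows filename suffixes. A hedged sketch of the pattern with hypothetical file and identifier names:

```go
//go:build linux

// glob_linux.go: the filename suffix (or this build tag) restricts the file
// to Linux; a sibling glob_windows.go would define the same identifier for
// Windows targets.
package platform

// LibGlob is a hypothetical search pattern; shared code can reference
// platform.LibGlob without any runtime.GOOS branching.
const LibGlob = "/usr/lib*/libexample.so*"
```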

+ 2 - 2
gpu/gpu_darwin.go

@@ -24,7 +24,7 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: GetCPUVariant(),
+				Variant: GetCPUCapability(),
 				memInfo: mem,
 			},
 		}
@@ -47,7 +47,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUVariant(),
+			Variant: GetCPUCapability(),
 			memInfo: mem,
 		},
 	}

+ 0 - 41
gpu/gpu_info_cpu.c

@@ -1,41 +0,0 @@
-#include "gpu_info.h"
-// Fallbacks for CPU mode
-
-#ifdef _WIN32
-#include <sysinfoapi.h>
-void cpu_check_ram(mem_info_t *resp) {
-  resp->err = NULL;
-  MEMORYSTATUSEX info;
-  info.dwLength = sizeof(info);
-  if (GlobalMemoryStatusEx(&info) != 0) {
-    resp->total = info.ullTotalPhys;
-    resp->free = info.ullAvailPhys;
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
-  } else {
-    resp->err = LOAD_ERR();
-  }
-  return;
-}
-
-#elif __linux__
-#include <errno.h>
-#include <string.h>
-#include <sys/sysinfo.h>
-void cpu_check_ram(mem_info_t *resp) {
-  struct sysinfo info;
-  resp->err = NULL;
-  if (sysinfo(&info) != 0) {
-    resp->err = strdup(strerror(errno));
-  } else {
-    resp->total = info.totalram * info.mem_unit;
-    resp->free = info.freeram * info.mem_unit;
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
-  }
-  return;
-}
-
-#elif __APPLE__
-// Unused - see gpu_darwin.go
-#else
-#error "Unsupported platform"
-#endif

+ 40 - 60
gpu/gpu_info_oneapi.c

@@ -4,8 +4,7 @@
 
 #include <string.h>
 
-void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
-{
+void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
   ze_result_t ret;
   resp->err = NULL;
   resp->oh.devices = NULL;
@@ -15,8 +14,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   const int buflen = 256;
   char buf[buflen + 1];
   int i, d, count;
-  struct lookup
-  {
+  struct lookup {
     char *s;
     void **p;
   } l[] = {
@@ -32,8 +30,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   };
 
   resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
-  if (!resp->oh.handle)
-  {
+  if (!resp->oh.handle) {
     char *msg = LOAD_ERR();
     snprintf(buf, buflen,
              "Unable to load %s library to query for Intel GPUs: %s\n",
@@ -48,14 +45,12 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
       "wiring Level-Zero management library functions in %s\n",
       oneapi_lib_path);
 
-  for (i = 0; l[i].s != NULL; i++)
-  {
+  for (i = 0; l[i].s != NULL; i++) {
     // TODO once we've squashed the remaining corner cases remove this log
     LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
 
     *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
-    if (!l[i].p)
-    {
+    if (!l[i].p) {
       resp->oh.handle = NULL;
       char *msg = LOAD_ERR();
       LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -68,8 +63,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   }
 
   ret = (*resp->oh.zesInit)(0);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
     snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
     resp->err = strdup(buf);
@@ -79,8 +73,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
 
   count = 0;
   ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
     snprintf(buf, buflen, "unable to get driver count: %x", ret);
     resp->err = strdup(buf);
@@ -91,10 +84,10 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
   resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
   memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
-  resp->oh.devices = malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t*));
+  resp->oh.devices =
+      malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
   ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
     snprintf(buf, buflen, "unable to get driver count: %x", ret);
     resp->err = strdup(buf);
@@ -103,19 +96,20 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   }
 
   for (d = 0; d < resp->oh.num_drivers; d++) {
-    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], NULL);
-    if (ret != ZE_RESULT_SUCCESS)
-    {
+    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
+                                   &resp->oh.num_devices[d], NULL);
+    if (ret != ZE_RESULT_SUCCESS) {
       LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
       snprintf(buf, buflen, "unable to get device count: %x", ret);
       resp->err = strdup(buf);
       oneapi_release(resp->oh);
       return;
     }
-    resp->oh.devices[d] = malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
-    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
-    if (ret != ZE_RESULT_SUCCESS)
-    {
+    resp->oh.devices[d] =
+        malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
+    ret = (*resp->oh.zesDeviceGet)(
+        resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
+    if (ret != ZE_RESULT_SUCCESS) {
       LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
       snprintf(buf, buflen, "unable to get device count: %x", ret);
       resp->err = strdup(buf);
@@ -128,8 +122,8 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   return;
 }
 
-void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *resp)
-{
+void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
+                       mem_info_t *resp) {
   ze_result_t ret;
   resp->err = NULL;
   uint64_t totalMem = 0;
@@ -138,12 +132,11 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   char buf[buflen + 1];
   int i, d, m;
 
-  if (h.handle == NULL)
-  {
+  if (h.handle == NULL) {
     resp->err = strdup("Level-Zero handle not initialized");
     return;
   }
-  
+
   if (driver > h.num_drivers || device > h.num_devices[driver]) {
    resp->err = strdup("driver or device index out of bounds");
     return;
@@ -161,8 +154,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   props.pNext = &ext_props;
 
   ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     snprintf(buf, buflen, "unable to get device properties: %d", ret);
     resp->err = strdup(buf);
     return;
@@ -175,8 +167,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   // TODO - the driver isn't included - what if there are multiple drivers?
   snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
 
-  if (h.verbose)
-  {
+  if (h.verbose) {
     // When in verbose mode, report more information about
     // the card we discover.
     LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
@@ -195,11 +186,11 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   // Compute Capability equivalent in resp->major, resp->minor, resp->patch
 
   uint32_t memCount = 0;
-  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, NULL);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
-    snprintf(buf, buflen,
-              "unable to enumerate Level-Zero memory modules: %x", ret);
+  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
+                                        NULL);
+  if (ret != ZE_RESULT_SUCCESS) {
+    snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
+             ret);
     resp->err = strdup(buf);
     return;
   }
@@ -209,14 +200,12 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
   (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
 
-  for (m = 0; m < memCount; m++)
-  {
+  for (m = 0; m < memCount; m++) {
     zes_mem_state_t state;
     state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
     state.pNext = NULL;
     ret = (*h.zesMemoryGetState)(mems[m], &state);
-    if (ret != ZE_RESULT_SUCCESS)
-    {
+    if (ret != ZE_RESULT_SUCCESS) {
       snprintf(buf, buflen, "unable to get memory state: %x", ret);
       resp->err = strdup(buf);
       free(mems);
@@ -230,29 +219,23 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   free(mems);
 }
 
-void oneapi_release(oneapi_handle_t h)
-{
+void oneapi_release(oneapi_handle_t h) {
   int d;
   LOG(h.verbose, "releasing oneapi library\n");
-  for (d = 0; d < h.num_drivers; d++)
-  {
-    if (h.devices != NULL && h.devices[d] != NULL)
-    {
+  for (d = 0; d < h.num_drivers; d++) {
+    if (h.devices != NULL && h.devices[d] != NULL) {
       free(h.devices[d]);
     }
   }
-  if (h.devices != NULL)
-  {
+  if (h.devices != NULL) {
     free(h.devices);
     h.devices = NULL;
   }
-  if (h.num_devices != NULL)
-  {
+  if (h.num_devices != NULL) {
     free(h.num_devices);
     h.num_devices = NULL;
   }
-  if (h.drivers != NULL)
-  {
+  if (h.drivers != NULL) {
     free(h.drivers);
     h.drivers = NULL;
   }
@@ -261,14 +244,11 @@ void oneapi_release(oneapi_handle_t h)
   h.handle = NULL;
 }
 
-int oneapi_get_device_count(oneapi_handle_t h, int driver) 
-{
-  if (h.handle == NULL || h.num_devices == NULL) 
-  {
+int oneapi_get_device_count(oneapi_handle_t h, int driver) {
+  if (h.handle == NULL || h.num_devices == NULL) {
     return 0;
   }
-  if (driver > h.num_drivers)
-  {
+  if (driver > h.num_drivers) {
     return 0;
   }
   return (int)h.num_devices[driver];

+ 24 - 43
gpu/gpu_info_oneapi.h

@@ -9,8 +9,7 @@
 #define ZE_BIT(_i) (1 << _i)
 
 // Just enough typedef's to dlopen/dlsym for memory information
-typedef enum ze_result_t
-{
+typedef enum ze_result_t {
   ZE_RESULT_SUCCESS = 0,
   // Other values omitted for now...
 } ze_result_t;
@@ -20,13 +19,11 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t;
 typedef struct _zes_device_handle_t *zes_device_handle_t;
 typedef struct _zes_mem_handle_t *zes_mem_handle_t;
 
-typedef enum _ze_structure_type_t
-{
+typedef enum _ze_structure_type_t {
   ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_structure_type_t;
 
-typedef enum _zes_structure_type_t
-{
+typedef enum _zes_structure_type_t {
   ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
   ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
   ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
@@ -34,35 +31,29 @@ typedef enum _zes_structure_type_t
   ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_structure_type_t;
 
-typedef enum _zes_mem_type_t
-{
+typedef enum _zes_mem_type_t {
   ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_mem_type_t;
 
-typedef enum _zes_mem_loc_t
-{
+typedef enum _zes_mem_loc_t {
   ZES_MEM_LOC_SYSTEM = 0,
   ZES_MEM_LOC_DEVICE = 1,
   ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
 } zes_mem_loc_t;
 
-typedef enum _zes_mem_health_t
-{
+typedef enum _zes_mem_health_t {
   ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
 } zes_mem_health_t;
 
-typedef struct _ze_device_uuid_t
-{
+typedef struct _ze_device_uuid_t {
   uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } ze_device_uuid_t;
 
-typedef struct _zes_uuid_t
-{
+typedef struct _zes_uuid_t {
   uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } zes_uuid_t;
 
-typedef enum _ze_device_type_t
-{
+typedef enum _ze_device_type_t {
   ZE_DEVICE_TYPE_GPU = 1,
   ZE_DEVICE_TYPE_CPU = 2,
   ZE_DEVICE_TYPE_FPGA = 3,
@@ -71,8 +62,7 @@ typedef enum _ze_device_type_t
   ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_device_type_t;
 
-typedef enum _zes_device_type_t
-{
+typedef enum _zes_device_type_t {
   ZES_DEVICE_TYPE_GPU = 1,
   ZES_DEVICE_TYPE_CPU = 2,
   ZES_DEVICE_TYPE_FPGA = 3,
@@ -82,8 +72,7 @@ typedef enum _zes_device_type_t
 } zes_device_type_t;
 
 typedef uint32_t ze_device_property_flags_t;
-typedef enum _ze_device_property_flag_t
-{
+typedef enum _ze_device_property_flag_t {
   ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
   ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
   ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -92,8 +81,7 @@ typedef enum _ze_device_property_flag_t
 } ze_device_property_flag_t;
 
 typedef uint32_t zes_device_property_flags_t;
-typedef enum _zes_device_property_flag_t
-{
+typedef enum _zes_device_property_flag_t {
   ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
   ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
   ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -101,8 +89,7 @@ typedef enum _zes_device_property_flag_t
   ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 } zes_device_property_flag_t;
 
-typedef struct _ze_device_properties_t
-{
+typedef struct _ze_device_properties_t {
   ze_structure_type_t stype;
   void *pNext;
   ze_device_type_t type;
@@ -126,8 +113,7 @@ typedef struct _ze_device_properties_t
   char name[ZE_MAX_DEVICE_NAME];
 } ze_device_properties_t;
 
-typedef struct _zes_device_properties_t
-{
+typedef struct _zes_device_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   ze_device_properties_t core;
@@ -140,8 +126,7 @@ typedef struct _zes_device_properties_t
   char driverVersion[ZES_STRING_PROPERTY_SIZE];
 } zes_device_properties_t;
 
-typedef struct _zes_device_ext_properties_t
-{
+typedef struct _zes_device_ext_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   zes_uuid_t uuid;
@@ -149,8 +134,7 @@ typedef struct _zes_device_ext_properties_t
   zes_device_property_flags_t flags;
 } zes_device_ext_properties_t;
 
-typedef struct _zes_mem_properties_t
-{
+typedef struct _zes_mem_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   zes_mem_type_t type;
@@ -162,8 +146,7 @@ typedef struct _zes_mem_properties_t
   int32_t numChannels;
 } zes_mem_properties_t;
 
-typedef struct _zes_mem_state_t
-{
+typedef struct _zes_mem_state_t {
   zes_structure_type_t stype;
   const void *pNext;
   zes_mem_health_t health;
@@ -171,15 +154,14 @@ typedef struct _zes_mem_state_t
   uint64_t size;
 } zes_mem_state_t;
 
-typedef struct oneapi_handle
-{
+typedef struct oneapi_handle {
   void *handle;
   uint16_t verbose;
 
   uint32_t num_drivers;
-  zes_driver_handle_t *drivers; 
+  zes_driver_handle_t *drivers;
   uint32_t *num_devices;
-  zes_device_handle_t **devices; 
+  zes_device_handle_t **devices;
 
   // TODO Driver major, minor information
   // int driver_major;
@@ -201,20 +183,19 @@ typedef struct oneapi_handle
 
 } oneapi_handle_t;
 
-typedef struct oneapi_init_resp
-{
+typedef struct oneapi_init_resp {
   char *err; // If err is non-null handle is invalid
   oneapi_handle_t oh;
 } oneapi_init_resp_t;
 
-typedef struct oneapi_version_resp
-{
+typedef struct oneapi_version_resp {
   ze_result_t status;
   char *str; // Contains version or error string if status != 0
 } oneapi_version_resp_t;
 
 void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
-void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *resp);
+void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
+                       mem_info_t *resp);
 void oneapi_release(oneapi_handle_t h);
 int oneapi_get_device_count(oneapi_handle_t h, int driver);
 

+ 89 - 0
gpu/gpu_linux.go

@@ -0,0 +1,89 @@
+package gpu
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/ollama/ollama/format"
+)
+
+var CudartGlobs = []string{
+	"/usr/local/cuda/lib64/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
+	"/usr/lib/wsl/lib/libcudart.so*",
+	"/usr/lib/wsl/drivers/*/libcudart.so*",
+	"/opt/cuda/lib64/libcudart.so*",
+	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
+	"/usr/local/cuda/lib*/libcudart.so*",
+	"/usr/lib*/libcudart.so*",
+	"/usr/local/lib*/libcudart.so*",
+}
+
+var NvmlGlobs = []string{}
+
+var NvcudaGlobs = []string{
+	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
+	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
+	"/usr/lib/*-linux-gnu/libcuda.so*",
+	"/usr/lib/wsl/lib/libcuda.so*",
+	"/usr/lib/wsl/drivers/*/libcuda.so*",
+	"/opt/cuda/lib*/libcuda.so*",
+	"/usr/local/cuda/lib*/libcuda.so*",
+	"/usr/lib*/libcuda.so*",
+	"/usr/local/lib*/libcuda.so*",
+}
+
+var OneapiGlobs = []string{
+	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
+	"/usr/lib*/libze_intel_gpu.so*",
+}
+
+var CudartMgmtName = "libcudart.so*"
+var NvcudaMgmtName = "libcuda.so*"
+var NvmlMgmtName = "" // not currently wired on linux
+var OneapiMgmtName = "libze_intel_gpu.so"
+
+func GetCPUMem() (memInfo, error) {
+	var mem memInfo
+	var total, available, free, buffers, cached uint64
+	f, err := os.Open("/proc/meminfo")
+	if err != nil {
+		return mem, err
+	}
+	defer f.Close()
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		line := s.Text()
+		switch {
+		case strings.HasPrefix(line, "MemTotal:"):
+			_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
+		case strings.HasPrefix(line, "MemAvailable:"):
+			_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
+		case strings.HasPrefix(line, "MemFree:"):
+			_, err = fmt.Sscanf(line, "MemFree:%d", &free)
+		case strings.HasPrefix(line, "Buffers:"):
+			_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
+		case strings.HasPrefix(line, "Cached:"):
+			_, err = fmt.Sscanf(line, "Cached:%d", &cached)
+		default:
+			continue
+		}
+		if err != nil {
+			return mem, err
+		}
+
+		if total > 0 && available > 0 {
+			mem.TotalMemory = total * format.KibiByte
+			mem.FreeMemory = available * format.KibiByte
+			return mem, nil
+		}
+	}
+	mem.TotalMemory = total * format.KibiByte
+	mem.FreeMemory = (free + buffers + cached) * format.KibiByte
+	return mem, nil
+}
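
Worth noting for the parser above: /proc/meminfo reports sizes in kibibytes, hence the format.KibiByte scaling, and Sscanf's %d skips the whitespace between the colon and the number. A standalone sketch with an illustrative line:

```go
package main

import "fmt"

func main() {
	var kb uint64
	line := "MemAvailable:   16384 kB" // illustrative /proc/meminfo line
	// %d skips the leading spaces, so this matches the parse above.
	if _, err := fmt.Sscanf(line, "MemAvailable:%d", &kb); err != nil {
		panic(err)
	}
	fmt.Println(kb * 1024) // 16777216 bytes (16 MiB)
}
```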

+ 55 - 0
gpu/gpu_windows.go

@@ -0,0 +1,55 @@
+package gpu
+
+import (
+	"fmt"
+	"syscall"
+	"unsafe"
+)
+
+type MEMORYSTATUSEX struct {
+	length               uint32
+	MemoryLoad           uint32
+	TotalPhys            uint64
+	AvailPhys            uint64
+	TotalPageFile        uint64
+	AvailPageFile        uint64
+	TotalVirtual         uint64
+	AvailVirtual         uint64
+	AvailExtendedVirtual uint64
+}
+
+var (
+	k32                      = syscall.NewLazyDLL("kernel32.dll")
+	globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
+	sizeofMemoryStatusEx     = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
+)
+
+var CudartGlobs = []string{
+	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
+}
+
+var NvmlGlobs = []string{
+	"c:\\Windows\\System32\\nvml.dll",
+}
+
+var NvcudaGlobs = []string{
+	"c:\\windows\\system*\\nvcuda.dll",
+}
+
+var OneapiGlobs = []string{
+	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
+}
+
+var CudartMgmtName = "cudart64_*.dll"
+var NvcudaMgmtName = "nvcuda.dll"
+var NvmlMgmtName = "nvml.dll"
+var OneapiMgmtName = "ze_intel_gpu64.dll"
+
+func GetCPUMem() (memInfo, error) {
+	memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
+	r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
+	if r1 == 0 {
+		return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
+	}
+	return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil
+}
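
One detail the struct literal above encodes: GlobalMemoryStatusEx fails unless dwLength is initialized to the structure size before the call, and the error from Proc.Call is only meaningful when r1 == 0. A self-contained sketch of the same call outside the gpu package:

```go
//go:build windows

package main

import (
	"fmt"
	"syscall"
	"unsafe"
)

// memoryStatusEx mirrors MEMORYSTATUSEX; field order and sizes must match
// the Win32 layout for the call to fill it correctly.
type memoryStatusEx struct {
	length, memoryLoad           uint32
	totalPhys, availPhys         uint64
	totalPageFile, availPageFile uint64
	totalVirtual, availVirtual   uint64
	availExtendedVirtual         uint64
}

func main() {
	proc := syscall.NewLazyDLL("kernel32.dll").NewProc("GlobalMemoryStatusEx")
	ms := memoryStatusEx{length: uint32(unsafe.Sizeof(memoryStatusEx{}))} // dwLength set first
	if r1, _, err := proc.Call(uintptr(unsafe.Pointer(&ms))); r1 == 0 {
		panic(err) // err is only meaningful on failure
	}
	fmt.Printf("total=%d free=%d\n", ms.totalPhys, ms.availPhys)
}
```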

+ 11 - 22
gpu/types.go

@@ -18,7 +18,7 @@ type GpuInfo struct {
 	Library string `json:"library,omitempty"`
 
 	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant string `json:"variant,omitempty"`
+	Variant CPUCapability `json:"variant"`
 
 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
@@ -44,21 +44,21 @@ type CPUInfo struct {
 
 type CudaGPUInfo struct {
 	GpuInfo
-	index int // nolint: unused
+	index int //nolint:unused,nolintlint
 }
 type CudaGPUInfoList []CudaGPUInfo
 
 type RocmGPUInfo struct {
 	GpuInfo
-	usedFilepath string // nolint: unused
-	index        int    // nolint: unused
+	usedFilepath string //nolint:unused,nolintlint
+	index        int    //nolint:unused,nolintlint
 }
 type RocmGPUInfoList []RocmGPUInfo
 
 type OneapiGPUInfo struct {
 	GpuInfo
-	driverIndex int // nolint: unused
-	gpuIndex    int // nolint: unused
+	driverIndex int //nolint:unused,nolintlint
+	gpuIndex    int //nolint:unused,nolintlint
 }
 type OneapiGPUInfoList []OneapiGPUInfo
 
@@ -71,8 +71,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != "" {
-			requested += "_" + info.Variant
+		if info.Variant != CPUCapabilityNone {
+			requested += "_" + info.Variant.String()
 		}
 		for i, lib := range libs {
 			if lib == requested {
@@ -117,30 +117,19 @@ type CPUCapability uint32
 var GPURunnerCPUCapability = CPUCapabilityAVX
 
 const (
-	CPUCapabilityBase CPUCapability = iota
+	CPUCapabilityNone CPUCapability = iota
 	CPUCapabilityAVX
 	CPUCapabilityAVX2
 	// TODO AVX512
 )
 
-func (c CPUCapability) ToString() string {
-	switch c {
-	case CPUCapabilityAVX:
-		return "AVX"
-	case CPUCapabilityAVX2:
-		return "AVX2"
-	default:
-		return "no vector extensions"
-	}
-}
-
-func (c CPUCapability) ToVariant() string {
+func (c CPUCapability) String() string {
 	switch c {
 	case CPUCapabilityAVX:
 		return "avx"
 	case CPUCapabilityAVX2:
 		return "avx2"
 	default:
-		return ""
+		return "no vector extensions"
 	}
 }
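
Replacing ToString/ToVariant with String also makes CPUCapability satisfy fmt.Stringer, which is why call sites in gpu.go and llm/payload.go can pass the value straight to slog or append info.Variant.String(). A minimal sketch of the effect (enum copied locally):

```go
package main

import "fmt"

type CPUCapability uint32

const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX
	CPUCapabilityAVX2
)

// String makes the type a fmt.Stringer, so %v and slog format it for free.
func (c CPUCapability) String() string {
	switch c {
	case CPUCapabilityAVX:
		return "avx"
	case CPUCapabilityAVX2:
		return "avx2"
	default:
		return "no vector extensions"
	}
}

func main() {
	fmt.Printf("required=%v detected=%v\n", CPUCapabilityAVX, CPUCapabilityNone)
	// required=avx detected=no vector extensions
}
```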

+ 2 - 1
integration/context_test.go

@@ -11,7 +11,8 @@ import (
 )
 
 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute) // Longer needed for small footprint GPUs
+	// A longer timeout is needed for small footprint GPUs
+	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{

+ 24 - 41
llm/memory.go

@@ -1,7 +1,6 @@
 package llm
 
 import (
-	"fmt"
 	"log/slog"
 	"strconv"
 	"strings"
@@ -69,13 +68,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
-	var includeOutput bool
 
-	// One extra layer as a pad for each GPU
-	var layerBuffer uint64
-
-	// The sizes of the main layers
-	var layerSizes []uint64
+	// The size of a single layer
+	var layerSize uint64
 
 	// The sum of all the layer sizes (just for logging)
 	var memoryWeights uint64
@@ -102,12 +97,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		layerBuffer = blk0.size()
+		layerSize = blk0.size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
 	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
@@ -119,6 +119,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
+	} else if len(gpus) > 1 {
+		// multigpu should always use the partial graph size
+		graphFullOffload = graphPartialOffload
 	}
 
 	if layer, ok := layers["output_norm"]; ok {
@@ -130,16 +133,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryLayerOutput += layer.size()
 	}
 
-	if gpus[0].Library == "metal" && opts.UseMMap {
-		includeOutput = true
-	} else if gpus[0].Library != "metal" || !opts.UseMMap {
-		includeOutput = true
-	}
-
+	// Output layer handled at the end if we have space
 	gpuZeroOverhead := projectorSize
-	if includeOutput {
-		gpuZeroOverhead += memoryLayerOutput
-	}
 
 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -156,12 +151,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			continue
 		}
 		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
-		gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full
+		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
 	}
 
 	var gpuZeroID int
@@ -170,23 +165,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		gpuAllocations[gpuZeroID] += gpuZeroOverhead
 	}
 
-	layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
+	// For all the layers, find where they can fit on the GPU(s)
 	for i := range int(ggml.KV().BlockCount()) {
-		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
-			memoryLayer := blk.size()
-
-			// KV is proportional to the number of layers
-			memoryLayer += kv / ggml.KV().BlockCount()
-			layerSizes[i] = memoryLayer
-			memoryWeights += memoryLayer
-		}
-	}
+		memoryWeights += layerSize
 
-	// For all the layers, find where they can fit on the GPU(s)
-	for i := range layerSizes {
-		if layerSizes[i] == 0 {
-			continue
-		}
 		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
 			// Stop allocating on GPU(s) once we hit the users target NumGPU
 			continue
@@ -196,8 +178,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSizes[i] {
-				gpuAllocations[g.i] += layerSizes[i]
+			if g.g.FreeMemory > used+layerSize {
+				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
 				break
@@ -205,17 +187,18 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
-
 	}
 	if layerCount >= int(ggml.KV().BlockCount()) {
 		fullyLoaded = true
 	} else {
 		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
-			overflow += layerSizes[i]
+			overflow += layerSize
 		}
 	}
-	// Find where the output fits
-	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+
+	// Determine whether we need to consider the output layer, then find where it fits
+	if ((gpus[0].Library == "metal" && opts.UseMMap) || (gpus[0].Library != "metal" || !opts.UseMMap)) &&
+		memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
@@ -226,6 +209,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				break
 			}
 		}
+
 		if layerCount < int(ggml.KV().BlockCount())+1 {
 			fullyLoaded = false
 			overflow += memoryLayerOutput
@@ -253,7 +237,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	var memoryRequiredPartial, memoryRequiredTotal uint64
 	for i := range gpuAllocations {
 		memoryRequiredPartial += gpuAllocations[i]
-
 	}
 	memoryRequiredTotal = memoryRequiredPartial + overflow
 

+ 39 - 28
llm/memory_test.go

@@ -18,7 +18,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	envconfig.Debug = true
 	modelName := "dummy"
 	f, err := os.CreateTemp(t.TempDir(), modelName)
-	assert.Nil(t, err)
+	require.NoError(t, err)
 	defer f.Close()
 	gguf := NewGGUFV3(binary.LittleEndian)
 	inputLayerCount := 5
@@ -30,7 +30,7 @@ func TestEstimateGPULayers(t *testing.T) {
 		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 	}
-	assert.Equal(t, inputLayerCount+1, len(tensors))
+	assert.Len(t, tensors, inputLayerCount+1)
 	err = gguf.Encode(f, KV{
 		"general.architecture":          "llama",
 		"general.name":                  "name",
@@ -56,9 +56,11 @@ func TestEstimateGPULayers(t *testing.T) {
 	}
 	projectors := []string{}
 	opts := api.DefaultOptions()
-	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
-	assert.Equal(t, 0, estimate.Layers)
-	assert.Equal(t, uint64(0), estimate.Graph)
+	t.Run("cpu", func(t *testing.T) {
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		assert.Equal(t, 0, estimate.Layers)
+		assert.Equal(t, uint64(0), estimate.Graph)
+	})
 
 	// derived from the dummy ggml file above
 	graphPartialOffload := uint64(202377216)
@@ -80,7 +82,10 @@ func TestEstimateGPULayers(t *testing.T) {
 		},
 	}
 	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
-	for i, s := range [][]uint64{
+	for i, s := range []struct {
+		layer0, layer1   uint64
+		expect0, expect1 uint64
+	}{
 		{1, 1, 1, 1},
 		{2, 1, 2, 1},
 		{2, 2, 2, 2},
@@ -90,27 +95,33 @@ func TestEstimateGPULayers(t *testing.T) {
 		{6, 6, 3, 3},
 		{0, 3, 0, 3},
 	} {
-		gpus[0].FreeMemory = 0
-		gpus[1].FreeMemory = 0
-		gpus[0].FreeMemory += projectorSize + memoryLayerOutput
-		gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
-		gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
-		gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
-		gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
-		assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
-		assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
-		var layerSums uint64
-		for _, b := range estimate.GPUSizes {
-			layerSums += b
-		}
-		if estimate.Layers < inputLayerCount+1 {
-			assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
-			assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
-		} else {
-			assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
-			assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
-		}
+		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
+			gpus[0].FreeMemory = 0
+			gpus[1].FreeMemory = 0
+			gpus[0].FreeMemory += projectorSize
+			if s.layer0 > 0 {
+				gpus[0].FreeMemory += memoryLayerOutput
+			} else {
+				gpus[1].FreeMemory += memoryLayerOutput
+			}
+			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
+			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
+			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
+			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
+			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
+			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
+			var layerSums uint64
+			for _, b := range estimate.GPUSizes {
+				layerSums += b
+			}
+			if estimate.Layers < inputLayerCount+1 {
+				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+			} else {
+				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+			}
+		})
 	}
-
 }

+ 8 - 8
llm/payload.go

@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := availableServers()
 	requested := info.Library
-	if info.Variant != "" {
-		requested += "_" + info.Variant
+	if info.Variant != gpu.CPUCapabilityNone {
+		requested += "_" + info.Variant.String()
 	}
 
 	servers := []string{}
@@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string {
 
 	// Load up the best CPU variant if not primary requested
 	if info.Library != "cpu" {
-		variant := gpu.GetCPUVariant()
+		variant := gpu.GetCPUCapability()
 		// If no variant, then we fall back to default
 		// If we have a variant, try that if we find an exact match
 		// Attempting to run the wrong CPU instructions will panic the
 		// process
-		if variant != "" {
+		if variant != gpu.CPUCapabilityNone {
 			for cmp := range availableServers {
-				if cmp == "cpu_"+variant {
+				if cmp == "cpu_"+variant.String() {
 					servers = append(servers, cmp)
 					break
 				}
@@ -146,11 +146,11 @@ func serverForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		return "metal"
 	}
-	variant := gpu.GetCPUVariant()
+	variant := gpu.GetCPUCapability()
 	availableServers := availableServers()
-	if variant != "" {
+	if variant != gpu.CPUCapabilityNone {
 		for cmp := range availableServers {
-			if cmp == "cpu_"+variant {
+			if cmp == "cpu_"+variant.String() {
 				return cmp
 			}
 		}
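
The variant checks above feed the runner lookup, which matches payload directories named <library> or <library>_<variant> (e.g. cpu, cpu_avx, cpu_avx2). A small sketch of the key construction; the helper name is mine, not from the tree:

```go
package main

import "fmt"

// runnerKey sketches the lookup key serversForGpu builds from a GPU's
// library and optional variant string.
func runnerKey(library, variant string) string {
	if variant != "" {
		return library + "_" + variant
	}
	return library
}

func main() {
	fmt.Println(runnerKey("cpu", "avx2")) // cpu_avx2
	fmt.Println(runnerKey("cuda", ""))    // cuda
}
```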

+ 2 - 2
llm/server.go

@@ -39,7 +39,7 @@ type LlamaServer interface {
 	Close() error
 	EstimatedVRAM() uint64 // Total VRAM across all GPUs
 	EstimatedTotal() uint64
-	EstimagedVRAMByGPU(gpuID string) uint64
+	EstimatedVRAMByGPU(gpuID string) uint64
 }
 
 // llmServer is an instance of the llama.cpp server
@@ -1016,7 +1016,7 @@ func (s *llmServer) EstimatedTotal() uint64 {
 	return s.estimate.TotalSize
 }
 
-func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 {
+func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
 	for i, gpu := range s.gpus {
 		if gpu.ID == gpuID {
 			return s.estimate.GPUSizes[i]

+ 3 - 6
server/sched.go

@@ -182,7 +182,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						// We want to avoid loading on any GPUs that have other
 						// models still loading on them to avoid potential races
 						// with VRAM consumption ramping up during load
-						availGpus := s.filterGPUsWithLoadingModels(gpus)
+						availGpus := s.filterGPUsWithoutLoadingModels(gpus)
 
 						// Update free memory from currently loaded models
 						s.updateFreeSpace(availGpus)
@@ -414,9 +414,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 		r.refMu.Lock()
 		if r.llama != nil {
 			for _, gpu := range allGpus {
-				// if slices.Contains(gpuIDs, gpu.ID) {
-				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID)
-				// }
+				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
 			}
 		} else {
 			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -448,7 +446,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
 	ret := append(gpu.GpuInfoList{}, allGpus...)
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
@@ -702,5 +700,4 @@ func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML,
 	// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
 
 	return s.findRunnerToUnload()
-
 }

+ 37 - 3
server/sched_test.go

@@ -156,7 +156,7 @@ func TestRequests(t *testing.T) {
 
 	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = 0
+	scenario1a.req.sessionDuration = 5 * time.Millisecond
 	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
 	scenario1b.req.model = scenario1a.req.model
 	scenario1b.ggml = scenario1a.ggml
@@ -167,6 +167,7 @@ func TestRequests(t *testing.T) {
 	tmpModel := *scenario1a.req.model
 	scenario2a.req.model = &tmpModel
 	scenario2a.ggml = scenario1a.ggml
+	scenario2a.req.sessionDuration = 5 * time.Millisecond
 
 	// Multiple loaded models
 	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
@@ -316,7 +317,6 @@ func TestGetRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
 	scenario1a.req.sessionDuration = 0
 	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
@@ -475,6 +475,40 @@ func TestUpdateFreeSpace(t *testing.T) {
 	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
 }
 
+func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer done()
+	gpus := gpu.GpuInfoList{
+		{
+			Library: "cuda",
+			ID:      "0",
+		},
+		{
+			Library: "cuda",
+			ID:      "1",
+		},
+	}
+	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+
+	s := InitScheduler(ctx)
+	s.loadedMu.Lock()
+	s.loaded["a"] = r1
+	s.loadedMu.Unlock()
+
+	tmp := s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 1)
+	require.Equal(t, "1", tmp[0].ID)
+
+	r1.gpus = gpu.GpuInfoList{gpus[1]}
+	tmp = s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 1)
+	require.Equal(t, "0", tmp[0].ID)
+
+	r1.gpus = gpu.GpuInfoList{}
+	tmp = s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 2)
+}
+
 func TestFindRunnerToUnload(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
@@ -607,4 +641,4 @@ func (s *mockLlm) Close() error {
 }
 func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
 func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
-func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
+func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }