
review comments and coverage

Daniel Hiltgen 11 months ago
parent
commit
6f351bf586
18 changed files with 374 additions and 455 deletions
  1. gpu/amd_linux.go (+9, -25)
  2. gpu/amd_windows.go (+2, -1)
  3. gpu/cpu_common.go (+2, -6)
  4. gpu/gpu.go (+25, -166)
  5. gpu/gpu_darwin.go (+2, -2)
  6. gpu/gpu_info_cpu.c (+0, -41)
  7. gpu/gpu_info_oneapi.c (+40, -60)
  8. gpu/gpu_info_oneapi.h (+24, -43)
  9. gpu/gpu_linux.go (+89, -0)
  10. gpu/gpu_windows.go (+55, -0)
  11. gpu/types.go (+11, -22)
  12. integration/context_test.go (+2, -1)
  13. llm/memory.go (+24, -41)
  14. llm/memory_test.go (+39, -28)
  15. llm/payload.go (+8, -8)
  16. llm/server.go (+2, -2)
  17. server/sched.go (+3, -6)
  18. server/sched_test.go (+37, -3)

+ 9 - 25
gpu/amd_linux.go

@@ -178,7 +178,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		// Shouldn't happen, but just in case...
 		if gpuID < 0 {
 			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			return []RocmGPUInfo{}
+			return nil
 		}
 
 		if int(major) < RocmComputeMin {
@@ -205,22 +205,17 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			matched := true
 			for _, m := range mapping {
 				if m.id == 0 {
+					// Null ID means it didn't populate, so we can't use it to match
 					continue
 				}
 				filename := filepath.Join(devDir, m.filename)
-				fp, err := os.Open(filename)
-				if err != nil {
-					slog.Debug("failed to open sysfs node", "file", filename, "error", err)
-					matched = false
-					break
-				}
-				defer fp.Close()
-				buf, err := io.ReadAll(fp)
+				buf, err := os.ReadFile(filename)
 				if err != nil {
 					slog.Debug("failed to read sysfs node", "file", filename, "error", err)
 					matched = false
 					break
 				}
+				// values here are in hex; strip off the leading 0x and parse so we can compare against the numeric (decimal) values amdgpu reports
 				cmp, err := strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(string(buf)), "0x"), 16, 64)
 				if err != nil {
 					slog.Debug("failed to parse sysfs node", "file", filename, "error", err)
@@ -239,13 +234,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			// Found the matching DRM directory
 			slog.Debug("matched", "amdgpu", match, "drm", devDir)
 			totalFile := filepath.Join(devDir, DRMTotalMemoryFile)
-			totalFp, err := os.Open(totalFile)
-			if err != nil {
-				slog.Debug("failed to open sysfs node", "file", totalFile, "error", err)
-				break
-			}
-			defer totalFp.Close()
-			buf, err := io.ReadAll(totalFp)
+			buf, err := os.ReadFile(totalFile)
 			if err != nil {
 				slog.Debug("failed to read sysfs node", "file", totalFile, "error", err)
 				break
@@ -284,7 +273,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 					TotalMemory: totalMemory,
 					FreeMemory:  (totalMemory - usedMemory),
 				},
-				ID:            fmt.Sprintf("%d", gpuID),
+				ID:            strconv.Itoa(gpuID),
 				Name:          name,
 				Compute:       fmt.Sprintf("gfx%d%x%x", major, minor, patch),
 				MinimumMemory: rocmMinimumMemory,
@@ -315,7 +304,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			libDir, err = AMDValidateLibDir()
 			if err != nil {
 				slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-				return []RocmGPUInfo{}
+				return nil
 			}
 		}
 		gpuInfo.DependencyPath = libDir
@@ -326,7 +315,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 				supported, err = GetSupportedGFX(libDir)
 				if err != nil {
 					slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-					return []RocmGPUInfo{}
+					return nil
 				}
 				slog.Debug("rocm supported GPUs", "types", supported)
 			}
@@ -434,12 +423,7 @@ func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
 }
 
 func getFreeMemory(usedFile string) (uint64, error) {
-	usedFp, err := os.Open(usedFile)
-	if err != nil {
-		return 0, fmt.Errorf("failed to open sysfs node %s %w", usedFile, err)
-	}
-	defer usedFp.Close()
-	buf, err := io.ReadAll(usedFp)
+	buf, err := os.ReadFile(usedFile)
 	if err != nil {
 		return 0, fmt.Errorf("failed to read sysfs node %s %w", usedFile, err)
 	}
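
Note on the sysfs matching above: the DRM comparison reads hex-encoded IDs (device, vendor, etc.) and checks them against the decimal values amdgpu reports. A minimal standalone sketch of that parse step, with an illustrative value rather than one from a real sysfs node:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseSysfsHex mirrors the parse used in AMDGetGPUInfo: sysfs nodes such
// as /sys/class/drm/card0/device/device hold values like "0x744c\n".
func parseSysfsHex(raw string) (uint64, error) {
	return strconv.ParseUint(strings.TrimPrefix(strings.TrimSpace(raw), "0x"), 16, 64)
}

func main() {
	v, err := parseSysfsHex("0x744c\n")
	if err != nil {
		panic(err)
	}
	fmt.Println(v) // 29772, directly comparable to amdgpu's decimal IDs
}
```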

+ 2 - 1
gpu/amd_windows.go

@@ -7,6 +7,7 @@ import (
 	"os"
 	"path/filepath"
 	"slices"
+	"strconv"
 	"strings"
 
 	"github.com/ollama/ollama/format"
@@ -124,7 +125,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 					TotalMemory: totalMemory,
 					FreeMemory:  freeMemory,
 				},
-				ID:             fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
+				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
 				DependencyPath: libDir,
 				MinimumMemory:  rocmMinimumMemory,
 				Name:           name,

+ 2 - 6
gpu/cpu_common.go

@@ -4,11 +4,7 @@ import (
 	"golang.org/x/sys/cpu"
 )
 
-func GetCPUVariant() string {
-	return getCPUCapability().ToVariant()
-}
-
-func getCPUCapability() CPUCapability {
+func GetCPUCapability() CPUCapability {
 	if cpu.X86.HasAVX2 {
 		return CPUCapabilityAVX2
 	}
@@ -16,5 +12,5 @@ func getCPUCapability() CPUCapability {
 		return CPUCapabilityAVX
 	}
 	// else LCD
-	return CPUCapabilityBase
+	return CPUCapabilityNone
 }
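
The rename to an exported GetCPUCapability pairs with the ordered CPUCapability enum in types.go: minimum-requirement checks become plain numeric comparisons instead of string matching. A tiny sketch with the types copied locally for illustration:

```go
package main

import "fmt"

// Copied shape of gpu.CPUCapability: the iota ordering encodes "strength",
// so < and >= express whether a CPU meets the runner's minimum.
type CPUCapability uint32

const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX
	CPUCapabilityAVX2
)

var GPURunnerCPUCapability = CPUCapabilityAVX

func main() {
	detected := CPUCapabilityNone // pretend detection found no AVX
	fmt.Println(detected >= GPURunnerCPUCapability) // false: fall back to CPU-only
}
```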

+ 25 - 166
gpu/gpu.go

@@ -11,8 +11,6 @@ package gpu
 */
 import "C"
 import (
-	"bufio"
-	"bytes"
 	"fmt"
 	"log/slog"
 	"os"
@@ -66,54 +64,6 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512M is what they typically report, so anything less than 1G must be iGPU
 
-var CudartLinuxGlobs = []string{
-	"/usr/local/cuda/lib64/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
-	"/usr/lib/wsl/lib/libcudart.so*",
-	"/usr/lib/wsl/drivers/*/libcudart.so*",
-	"/opt/cuda/lib64/libcudart.so*",
-	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
-	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
-	"/usr/local/cuda/lib*/libcudart.so*",
-	"/usr/lib*/libcudart.so*",
-	"/usr/local/lib*/libcudart.so*",
-}
-
-var CudartWindowsGlobs = []string{
-	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
-}
-
-var NvmlWindowsGlobs = []string{
-	"c:\\Windows\\System32\\nvml.dll",
-}
-
-var NvcudaLinuxGlobs = []string{
-	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
-	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
-	"/usr/lib/*-linux-gnu/libcuda.so*",
-	"/usr/lib/wsl/lib/libcuda.so*",
-	"/usr/lib/wsl/drivers/*/libcuda.so*",
-	"/opt/cuda/lib*/libcuda.so*",
-	"/usr/local/cuda/lib*/libcuda.so*",
-	"/usr/lib*/libcuda.so*",
-	"/usr/local/lib*/libcuda.so*",
-}
-
-var NvcudaWindowsGlobs = []string{
-	"c:\\windows\\system*\\nvcuda.dll",
-}
-
-var OneapiWindowsGlobs = []string{
-	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
-}
-
-var OneapiLinuxGlobs = []string{
-	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
-	"/usr/lib*/libze_intel_gpu.so*",
-}
-
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
@@ -139,47 +89,24 @@ func initCudaHandles() *cudaHandles {
 	}
 
 	slog.Debug("searching for GPU discovery libraries for NVIDIA")
-	var cudartMgmtName string
 	var cudartMgmtPatterns []string
-	var nvcudaMgmtName string
-	var nvcudaMgmtPatterns []string
-	var nvmlMgmtName string
-	var nvmlMgmtPatterns []string
-
-	tmpDir, _ := PayloadsDir()
-	switch runtime.GOOS {
-	case "windows":
-		cudartMgmtName = "cudart64_*.dll"
-		localAppData := os.Getenv("LOCALAPPDATA")
-		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", cudartMgmtName)}
-		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartWindowsGlobs...)
-		// Aligned with driver, we can't carry as payloads
-		nvcudaMgmtName = "nvcuda.dll"
-		nvcudaMgmtPatterns = NvcudaWindowsGlobs
-
-		// Use nvml to refresh free memory on windows only
-		nvmlMgmtName = "nvml.dll"
-		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
-		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
 
-	case "linux":
-		cudartMgmtName = "libcudart.so*"
-		if tmpDir != "" {
-			// TODO - add "payloads" for subprocess
-			cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", cudartMgmtName)}
-		}
-		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
-		// Aligned with driver, we can't carry as payloads
-		nvcudaMgmtName = "libcuda.so*"
-		nvcudaMgmtPatterns = NvcudaLinuxGlobs
+	// Aligned with driver, we can't carry as payloads
+	nvcudaMgmtPatterns := NvcudaGlobs
 
-		// nvml omitted on linux
-	default:
-		return cHandles
+	if runtime.GOOS == "windows" {
+		localAppData := os.Getenv("LOCALAPPDATA")
+		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
+	}
+	tmpDir, _ := PayloadsDir()
+	if tmpDir != "" {
+		// TODO - add "payloads" for subprocess
+		cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
 	}
+	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
 
-	if len(nvmlMgmtPatterns) > 0 {
-		nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+	if len(NvmlGlobs) > 0 {
+		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
 		if len(nvmlLibPaths) > 0 {
 			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
 			if nvml != nil {
@@ -190,7 +117,7 @@ func initCudaHandles() *cudaHandles {
 		}
 	}
 
-	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
+	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
 		if nvcuda != nil {
@@ -202,7 +129,7 @@ func initCudaHandles() *cudaHandles {
 		}
 	}
 
-	cudartLibPaths := FindGPULibs(cudartMgmtName, cudartMgmtPatterns)
+	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
 		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
@@ -220,8 +147,6 @@ func initCudaHandles() *cudaHandles {
 // Note: gpuMutex must already be held
 func initOneAPIHandles() *oneapiHandles {
 	oHandles := &oneapiHandles{}
-	var oneapiMgmtName string
-	var oneapiMgmtPatterns []string
 
 	// Short Circuit if we already know which library to use
 	if oneapiLibPath != "" {
@@ -229,18 +154,7 @@ func initOneAPIHandles() *oneapiHandles {
 		return oHandles
 	}
 
-	switch runtime.GOOS {
-	case "windows":
-		oneapiMgmtName = "ze_intel_gpu64.dll"
-		oneapiMgmtPatterns = OneapiWindowsGlobs
-	case "linux":
-		oneapiMgmtName = "libze_intel_gpu.so"
-		oneapiMgmtPatterns = OneapiLinuxGlobs
-	default:
-		return oHandles
-	}
-
-	oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
+	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
 	if len(oneapiLibPaths) > 0 {
 		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
 	}
@@ -290,7 +204,7 @@ func GetGPUInfo() GpuInfoList {
 	if !bootstrapped {
 		slog.Debug("Detecting GPUs")
 		needRefresh = false
-		cpuCapability = getCPUCapability()
+		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t
 
 		mem, err := GetCPUMem()
@@ -301,14 +215,14 @@ func GetGPUInfo() GpuInfoList {
 			GpuInfo: GpuInfo{
 				memInfo: mem,
 				Library: "cpu",
-				Variant: cpuCapability.ToVariant(),
+				Variant: cpuCapability,
 				ID:      "0",
 			},
 		}}
 
 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
-			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability.ToString(), "detected", cpuCapability.ToString())
+			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
 			bootstrapped = true
 			// No need to do any GPU discovery, since we can't run on them
 			return GpuInfoList{cpus[0].GpuInfo}
@@ -357,8 +271,8 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.MinimumMemory = cudaMinimumMemory
 				gpuInfo.DependencyPath = depPath
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-				gpuInfo.DriverMajor = int(driverMajor)
-				gpuInfo.DriverMinor = int(driverMinor)
+				gpuInfo.DriverMajor = driverMajor
+				gpuInfo.DriverMinor = driverMinor
 
 				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
 				cudaGPUs = append(cudaGPUs, gpuInfo)
@@ -374,16 +288,16 @@ func GetGPUInfo() GpuInfoList {
 				continue
 			}
 			devCount := C.oneapi_get_device_count(*oHandles.oneapi, C.int(d))
-			for i := 0; i < int(devCount); i++ {
+			for i := range devCount {
 				gpuInfo := OneapiGPUInfo{
 					GpuInfo: GpuInfo{
 						Library: "oneapi",
 					},
 					driverIndex: d,
-					gpuIndex:    i,
+					gpuIndex:    int(i),
 				}
 				// TODO - split bootstrapping from updating free memory
-				C.oneapi_check_vram(*oHandles.oneapi, C.int(d), C.int(i), &memInfo)
+				C.oneapi_check_vram(*oHandles.oneapi, C.int(d), i, &memInfo)
 				// TODO - convert this to MinimumMemory based on testing...
 				var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
 				memInfo.free = C.uint64_t(totalFreeMem)
@@ -505,22 +419,6 @@ func GetGPUInfo() GpuInfoList {
 	return resp
 }
 
-func GetCPUMem() (memInfo, error) {
-	if runtime.GOOS == "linux" {
-		return GetLinuxMemInfo()
-	}
-	var ret memInfo
-	var info C.mem_info_t
-	C.cpu_check_ram(&info)
-	if info.err != nil {
-		defer C.free(unsafe.Pointer(info.err))
-		return ret, fmt.Errorf(C.GoString(info.err))
-	}
-	ret.FreeMemory = uint64(info.free)
-	ret.TotalMemory = uint64(info.total)
-	return ret, nil
-}
-
 func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	// Multiple GPU libraries may exist, and some may not work, so keep trying until we exhaust them
 	var ldPaths []string
@@ -646,7 +544,7 @@ func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			for i := 0; i < int(resp.oh.num_drivers); i++ {
+			for i := range resp.oh.num_drivers {
 				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
 			}
 			return num_devices, &resp.oh, libPath
@@ -682,42 +580,3 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return "", ""
 	}
 }
-
-func GetLinuxMemInfo() (memInfo, error) {
-	var mem memInfo
-	var total, available, free, buffers, cached uint64
-	f, err := os.Open("/proc/meminfo")
-	if err != nil {
-		return mem, err
-	}
-	defer f.Close()
-	s := bufio.NewScanner(f)
-	for s.Scan() {
-		switch {
-		case bytes.HasPrefix(s.Bytes(), []byte(`MemTotal:`)):
-			_, err = fmt.Sscanf(s.Text(), "MemTotal:%d", &total)
-		case bytes.HasPrefix(s.Bytes(), []byte(`MemAvailable:`)):
-			_, err = fmt.Sscanf(s.Text(), "MemAvailable:%d", &available)
-		case bytes.HasPrefix(s.Bytes(), []byte(`MemFree:`)):
-			_, err = fmt.Sscanf(s.Text(), "MemFree:%d", &free)
-		case bytes.HasPrefix(s.Bytes(), []byte(`Buffers:`)):
-			_, err = fmt.Sscanf(s.Text(), "Buffers:%d", &buffers)
-		case bytes.HasPrefix(s.Bytes(), []byte(`Cached:`)):
-			_, err = fmt.Sscanf(s.Text(), "Cached:%d", &cached)
-		default:
-			continue
-		}
-		if err != nil {
-			return mem, err
-		}
-
-		if total > 0 && available > 0 {
-			mem.TotalMemory = total * 1024
-			mem.FreeMemory = available * 1024
-			return mem, nil
-		}
-	}
-	mem.TotalMemory = total * 1024
-	mem.FreeMemory = (free + buffers + cached) * 1024
-	return mem, nil
-}
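
With the OS-specific globs and management-library names moved into gpu_linux.go and gpu_windows.go (added below), the runtime.GOOS switches in initCudaHandles and initOneAPIHandles collapse: the Go toolchain picks the right definitions at build time from the _linux/_windows filename suffixes. A hedged sketch of the pattern with hypothetical file and identifier names:

```go
//go:build linux

// glob_linux.go: the filename suffix (or this build tag) restricts the file
// to Linux; a sibling glob_windows.go would define the same identifier for
// Windows targets.
package platform

// LibGlob is a hypothetical search pattern; shared code can reference
// platform.LibGlob without any runtime.GOOS branching.
const LibGlob = "/usr/lib*/libexample.so*"
```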

+ 2 - 2
gpu/gpu_darwin.go

@@ -24,7 +24,7 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: GetCPUVariant(),
+				Variant: GetCPUCapability(),
 				memInfo: mem,
 			},
 		}
@@ -47,7 +47,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUVariant(),
+			Variant: GetCPUCapability(),
 			memInfo: mem,
 		},
 	}

+ 0 - 41
gpu/gpu_info_cpu.c

@@ -1,41 +0,0 @@
-#include "gpu_info.h"
-// Fallbacks for CPU mode
-
-#ifdef _WIN32
-#include <sysinfoapi.h>
-void cpu_check_ram(mem_info_t *resp) {
-  resp->err = NULL;
-  MEMORYSTATUSEX info;
-  info.dwLength = sizeof(info);
-  if (GlobalMemoryStatusEx(&info) != 0) {
-    resp->total = info.ullTotalPhys;
-    resp->free = info.ullAvailPhys;
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
-  } else {
-    resp->err = LOAD_ERR();
-  }
-  return;
-}
-
-#elif __linux__
-#include <errno.h>
-#include <string.h>
-#include <sys/sysinfo.h>
-void cpu_check_ram(mem_info_t *resp) {
-  struct sysinfo info;
-  resp->err = NULL;
-  if (sysinfo(&info) != 0) {
-    resp->err = strdup(strerror(errno));
-  } else {
-    resp->total = info.totalram * info.mem_unit;
-    resp->free = info.freeram * info.mem_unit;
-    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "0");
-  }
-  return;
-}
-
-#elif __APPLE__
-// Unused - see gpu_darwin.go
-#else
-#error "Unsupported platform"
-#endif

+ 40 - 60
gpu/gpu_info_oneapi.c

@@ -4,8 +4,7 @@
 
 #include <string.h>
 
-void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
-{
+void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp) {
   ze_result_t ret;
   resp->err = NULL;
   resp->oh.devices = NULL;
@@ -15,8 +14,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   const int buflen = 256;
   char buf[buflen + 1];
   int i, d, count;
-  struct lookup
-  {
+  struct lookup {
     char *s;
     void **p;
   } l[] = {
@@ -32,8 +30,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   };
 
   resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
-  if (!resp->oh.handle)
-  {
+  if (!resp->oh.handle) {
     char *msg = LOAD_ERR();
     snprintf(buf, buflen,
              "Unable to load %s library to query for Intel GPUs: %s\n",
@@ -48,14 +45,12 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
       "wiring Level-Zero management library functions in %s\n",
       oneapi_lib_path);
 
-  for (i = 0; l[i].s != NULL; i++)
-  {
+  for (i = 0; l[i].s != NULL; i++) {
     // TODO once we've squashed the remaining corner cases remove this log
     LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
 
     *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
-    if (!l[i].p)
-    {
+    if (!l[i].p) {
       resp->oh.handle = NULL;
       char *msg = LOAD_ERR();
       LOG(resp->oh.verbose, "dlerr: %s\n", msg);
@@ -68,8 +63,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   }
 
   ret = (*resp->oh.zesInit)(0);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     LOG(resp->oh.verbose, "zesInit err: %x\n", ret);
     snprintf(buf, buflen, "oneapi vram init failure: %x", ret);
     resp->err = strdup(buf);
@@ -79,8 +73,7 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
 
   count = 0;
   ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, NULL);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
     snprintf(buf, buflen, "unable to get driver count: %x", ret);
     resp->err = strdup(buf);
@@ -91,10 +84,10 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   resp->oh.drivers = malloc(resp->oh.num_drivers * sizeof(zes_driver_handle_t));
   resp->oh.num_devices = malloc(resp->oh.num_drivers * sizeof(uint32_t));
   memset(&resp->oh.num_devices[0], 0, resp->oh.num_drivers * sizeof(uint32_t));
-  resp->oh.devices = malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t*));
+  resp->oh.devices =
+      malloc(resp->oh.num_drivers * sizeof(zes_device_handle_t *));
   ret = (*resp->oh.zesDriverGet)(&resp->oh.num_drivers, &resp->oh.drivers[0]);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     LOG(resp->oh.verbose, "zesDriverGet err: %x\n", ret);
     snprintf(buf, buflen, "unable to get driver count: %x", ret);
     resp->err = strdup(buf);
@@ -103,19 +96,20 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   }
 
   for (d = 0; d < resp->oh.num_drivers; d++) {
-    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], NULL);
-    if (ret != ZE_RESULT_SUCCESS)
-    {
+    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d],
+                                   &resp->oh.num_devices[d], NULL);
+    if (ret != ZE_RESULT_SUCCESS) {
       LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
       snprintf(buf, buflen, "unable to get device count: %x", ret);
       resp->err = strdup(buf);
       oneapi_release(resp->oh);
       return;
     }
-    resp->oh.devices[d] = malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
-    ret = (*resp->oh.zesDeviceGet)(resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
-    if (ret != ZE_RESULT_SUCCESS)
-    {
+    resp->oh.devices[d] =
+        malloc(resp->oh.num_devices[d] * sizeof(zes_device_handle_t));
+    ret = (*resp->oh.zesDeviceGet)(
+        resp->oh.drivers[d], &resp->oh.num_devices[d], resp->oh.devices[d]);
+    if (ret != ZE_RESULT_SUCCESS) {
       LOG(resp->oh.verbose, "zesDeviceGet err: %x\n", ret);
       snprintf(buf, buflen, "unable to get device count: %x", ret);
       resp->err = strdup(buf);
@@ -128,8 +122,8 @@ void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
   return;
 }
 
-void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *resp)
-{
+void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
+                       mem_info_t *resp) {
   ze_result_t ret;
   resp->err = NULL;
   uint64_t totalMem = 0;
@@ -138,12 +132,11 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   char buf[buflen + 1];
   int i, d, m;
 
-  if (h.handle == NULL)
-  {
+  if (h.handle == NULL) {
     resp->err = strdup("Level-Zero handle not initialized");
     return;
   }
-  
+
   if (driver > h.num_drivers || device > h.num_devices[driver]) {
    resp->err = strdup("driver or device index out of bounds");
     return;
@@ -161,8 +154,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   props.pNext = &ext_props;
 
   ret = (*h.zesDeviceGetProperties)(h.devices[driver][device], &props);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
+  if (ret != ZE_RESULT_SUCCESS) {
     snprintf(buf, buflen, "unable to get device properties: %d", ret);
     resp->err = strdup(buf);
     return;
@@ -175,8 +167,7 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   // TODO - the driver isn't included - what if there are multiple drivers?
   snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device);
 
-  if (h.verbose)
-  {
+  if (h.verbose) {
     // When in verbose mode, report more information about
     // the card we discover.
     LOG(h.verbose, "[%d:%d] oneAPI device name: %s\n", driver, device,
@@ -195,11 +186,11 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   // Compute Capability equivalent in resp->major, resp->minor, resp->patch
 
   uint32_t memCount = 0;
-  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, NULL);
-  if (ret != ZE_RESULT_SUCCESS)
-  {
-    snprintf(buf, buflen,
-              "unable to enumerate Level-Zero memory modules: %x", ret);
+  ret = (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount,
+                                        NULL);
+  if (ret != ZE_RESULT_SUCCESS) {
+    snprintf(buf, buflen, "unable to enumerate Level-Zero memory modules: %x",
+             ret);
     resp->err = strdup(buf);
     return;
   }
@@ -209,14 +200,12 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
   (*h.zesDeviceEnumMemoryModules)(h.devices[driver][device], &memCount, mems);
 
-  for (m = 0; m < memCount; m++)
-  {
+  for (m = 0; m < memCount; m++) {
     zes_mem_state_t state;
     state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
     state.pNext = NULL;
     ret = (*h.zesMemoryGetState)(mems[m], &state);
-    if (ret != ZE_RESULT_SUCCESS)
-    {
+    if (ret != ZE_RESULT_SUCCESS) {
       snprintf(buf, buflen, "unable to get memory state: %x", ret);
       resp->err = strdup(buf);
       free(mems);
@@ -230,29 +219,23 @@ void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *re
   free(mems);
 }
 
-void oneapi_release(oneapi_handle_t h)
-{
+void oneapi_release(oneapi_handle_t h) {
   int d;
   LOG(h.verbose, "releasing oneapi library\n");
-  for (d = 0; d < h.num_drivers; d++)
-  {
-    if (h.devices != NULL && h.devices[d] != NULL)
-    {
+  for (d = 0; d < h.num_drivers; d++) {
+    if (h.devices != NULL && h.devices[d] != NULL) {
       free(h.devices[d]);
     }
   }
-  if (h.devices != NULL)
-  {
+  if (h.devices != NULL) {
     free(h.devices);
     h.devices = NULL;
   }
-  if (h.num_devices != NULL)
-  {
+  if (h.num_devices != NULL) {
     free(h.num_devices);
     h.num_devices = NULL;
   }
-  if (h.drivers != NULL)
-  {
+  if (h.drivers != NULL) {
     free(h.drivers);
     h.drivers = NULL;
   }
@@ -261,14 +244,11 @@ void oneapi_release(oneapi_handle_t h)
   h.handle = NULL;
 }
 
-int oneapi_get_device_count(oneapi_handle_t h, int driver) 
-{
-  if (h.handle == NULL || h.num_devices == NULL) 
-  {
+int oneapi_get_device_count(oneapi_handle_t h, int driver) {
+  if (h.handle == NULL || h.num_devices == NULL) {
     return 0;
   }
-  if (driver > h.num_drivers)
-  {
+  if (driver > h.num_drivers) {
     return 0;
   }
   return (int)h.num_devices[driver];

+ 24 - 43
gpu/gpu_info_oneapi.h

@@ -9,8 +9,7 @@
 #define ZE_BIT(_i) (1 << _i)
 
 // Just enough typedef's to dlopen/dlsym for memory information
-typedef enum ze_result_t
-{
+typedef enum ze_result_t {
   ZE_RESULT_SUCCESS = 0,
   // Other values omitted for now...
 } ze_result_t;
@@ -20,13 +19,11 @@ typedef struct _zes_driver_handle_t *zes_driver_handle_t;
 typedef struct _zes_device_handle_t *zes_device_handle_t;
 typedef struct _zes_mem_handle_t *zes_mem_handle_t;
 
-typedef enum _ze_structure_type_t
-{
+typedef enum _ze_structure_type_t {
   ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_structure_type_t;
 
-typedef enum _zes_structure_type_t
-{
+typedef enum _zes_structure_type_t {
   ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
   ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
   ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
@@ -34,35 +31,29 @@ typedef enum _zes_structure_type_t
   ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_structure_type_t;
 
-typedef enum _zes_mem_type_t
-{
+typedef enum _zes_mem_type_t {
   ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
 } zes_mem_type_t;
 
-typedef enum _zes_mem_loc_t
-{
+typedef enum _zes_mem_loc_t {
   ZES_MEM_LOC_SYSTEM = 0,
   ZES_MEM_LOC_DEVICE = 1,
   ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
 } zes_mem_loc_t;
 
-typedef enum _zes_mem_health_t
-{
+typedef enum _zes_mem_health_t {
   ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
 } zes_mem_health_t;
 
-typedef struct _ze_device_uuid_t
-{
+typedef struct _ze_device_uuid_t {
   uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } ze_device_uuid_t;
 
-typedef struct _zes_uuid_t
-{
+typedef struct _zes_uuid_t {
   uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
 } zes_uuid_t;
 
-typedef enum _ze_device_type_t
-{
+typedef enum _ze_device_type_t {
   ZE_DEVICE_TYPE_GPU = 1,
   ZE_DEVICE_TYPE_CPU = 2,
   ZE_DEVICE_TYPE_FPGA = 3,
@@ -71,8 +62,7 @@ typedef enum _ze_device_type_t
   ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
 } ze_device_type_t;
 
-typedef enum _zes_device_type_t
-{
+typedef enum _zes_device_type_t {
   ZES_DEVICE_TYPE_GPU = 1,
   ZES_DEVICE_TYPE_CPU = 2,
   ZES_DEVICE_TYPE_FPGA = 3,
@@ -82,8 +72,7 @@ typedef enum _zes_device_type_t
 } zes_device_type_t;
 
 typedef uint32_t ze_device_property_flags_t;
-typedef enum _ze_device_property_flag_t
-{
+typedef enum _ze_device_property_flag_t {
   ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
   ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
   ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -92,8 +81,7 @@ typedef enum _ze_device_property_flag_t
 } ze_device_property_flag_t;
 
 typedef uint32_t zes_device_property_flags_t;
-typedef enum _zes_device_property_flag_t
-{
+typedef enum _zes_device_property_flag_t {
   ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
   ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
   ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
@@ -101,8 +89,7 @@ typedef enum _zes_device_property_flag_t
   ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
 } zes_device_property_flag_t;
 
-typedef struct _ze_device_properties_t
-{
+typedef struct _ze_device_properties_t {
   ze_structure_type_t stype;
   void *pNext;
   ze_device_type_t type;
@@ -126,8 +113,7 @@ typedef struct _ze_device_properties_t
   char name[ZE_MAX_DEVICE_NAME];
 } ze_device_properties_t;
 
-typedef struct _zes_device_properties_t
-{
+typedef struct _zes_device_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   ze_device_properties_t core;
@@ -140,8 +126,7 @@ typedef struct _zes_device_properties_t
   char driverVersion[ZES_STRING_PROPERTY_SIZE];
 } zes_device_properties_t;
 
-typedef struct _zes_device_ext_properties_t
-{
+typedef struct _zes_device_ext_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   zes_uuid_t uuid;
@@ -149,8 +134,7 @@ typedef struct _zes_device_ext_properties_t
   zes_device_property_flags_t flags;
 } zes_device_ext_properties_t;
 
-typedef struct _zes_mem_properties_t
-{
+typedef struct _zes_mem_properties_t {
   zes_structure_type_t stype;
   void *pNext;
   zes_mem_type_t type;
@@ -162,8 +146,7 @@ typedef struct _zes_mem_properties_t
   int32_t numChannels;
 } zes_mem_properties_t;
 
-typedef struct _zes_mem_state_t
-{
+typedef struct _zes_mem_state_t {
   zes_structure_type_t stype;
   const void *pNext;
   zes_mem_health_t health;
@@ -171,15 +154,14 @@ typedef struct _zes_mem_state_t
   uint64_t size;
 } zes_mem_state_t;
 
-typedef struct oneapi_handle
-{
+typedef struct oneapi_handle {
   void *handle;
   uint16_t verbose;
 
   uint32_t num_drivers;
-  zes_driver_handle_t *drivers; 
+  zes_driver_handle_t *drivers;
   uint32_t *num_devices;
-  zes_device_handle_t **devices; 
+  zes_device_handle_t **devices;
 
   // TODO Driver major, minor information
   // int driver_major;
@@ -201,20 +183,19 @@ typedef struct oneapi_handle
 
 } oneapi_handle_t;
 
-typedef struct oneapi_init_resp
-{
+typedef struct oneapi_init_resp {
   char *err; // If err is non-null handle is invalid
   oneapi_handle_t oh;
 } oneapi_init_resp_t;
 
-typedef struct oneapi_version_resp
-{
+typedef struct oneapi_version_resp {
   ze_result_t status;
   char *str; // Contains version or error string if status != 0
 } oneapi_version_resp_t;
 
 void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
-void oneapi_check_vram(oneapi_handle_t h, int driver, int device, mem_info_t *resp);
+void oneapi_check_vram(oneapi_handle_t h, int driver, int device,
+                       mem_info_t *resp);
 void oneapi_release(oneapi_handle_t h);
 int oneapi_get_device_count(oneapi_handle_t h, int driver);
 

+ 89 - 0
gpu/gpu_linux.go

@@ -0,0 +1,89 @@
+package gpu
+
+import (
+	"bufio"
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/ollama/ollama/format"
+)
+
+var CudartGlobs = []string{
+	"/usr/local/cuda/lib64/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/x86_64-linux-gnu/libcudart.so*",
+	"/usr/lib/wsl/lib/libcudart.so*",
+	"/usr/lib/wsl/drivers/*/libcudart.so*",
+	"/opt/cuda/lib64/libcudart.so*",
+	"/usr/local/cuda*/targets/aarch64-linux/lib/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/nvidia/current/libcudart.so*",
+	"/usr/lib/aarch64-linux-gnu/libcudart.so*",
+	"/usr/local/cuda/lib*/libcudart.so*",
+	"/usr/lib*/libcudart.so*",
+	"/usr/local/lib*/libcudart.so*",
+}
+
+var NvmlGlobs = []string{}
+
+var NvcudaGlobs = []string{
+	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
+	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
+	"/usr/lib/*-linux-gnu/libcuda.so*",
+	"/usr/lib/wsl/lib/libcuda.so*",
+	"/usr/lib/wsl/drivers/*/libcuda.so*",
+	"/opt/cuda/lib*/libcuda.so*",
+	"/usr/local/cuda/lib*/libcuda.so*",
+	"/usr/lib*/libcuda.so*",
+	"/usr/local/lib*/libcuda.so*",
+}
+
+var OneapiGlobs = []string{
+	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
+	"/usr/lib*/libze_intel_gpu.so*",
+}
+
+var CudartMgmtName = "libcudart.so*"
+var NvcudaMgmtName = "libcuda.so*"
+var NvmlMgmtName = "" // not currently wired on linux
+var OneapiMgmtName = "libze_intel_gpu.so"
+
+func GetCPUMem() (memInfo, error) {
+	var mem memInfo
+	var total, available, free, buffers, cached uint64
+	f, err := os.Open("/proc/meminfo")
+	if err != nil {
+		return mem, err
+	}
+	defer f.Close()
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		line := s.Text()
+		switch {
+		case strings.HasPrefix(line, "MemTotal:"):
+			_, err = fmt.Sscanf(line, "MemTotal:%d", &total)
+		case strings.HasPrefix(line, "MemAvailable:"):
+			_, err = fmt.Sscanf(line, "MemAvailable:%d", &available)
+		case strings.HasPrefix(line, "MemFree:"):
+			_, err = fmt.Sscanf(line, "MemFree:%d", &free)
+		case strings.HasPrefix(line, "Buffers:"):
+			_, err = fmt.Sscanf(line, "Buffers:%d", &buffers)
+		case strings.HasPrefix(line, "Cached:"):
+			_, err = fmt.Sscanf(line, "Cached:%d", &cached)
+		default:
+			continue
+		}
+		if err != nil {
+			return mem, err
+		}
+
+		if total > 0 && available > 0 {
+			mem.TotalMemory = total * format.KibiByte
+			mem.FreeMemory = available * format.KibiByte
+			return mem, nil
+		}
+	}
+	mem.TotalMemory = total * format.KibiByte
+	mem.FreeMemory = (free + buffers + cached) * format.KibiByte
+	return mem, nil
+}
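
Worth noting for the parser above: /proc/meminfo reports sizes in kibibytes, hence the format.KibiByte scaling, and Sscanf's %d skips the whitespace between the colon and the number. A standalone sketch with an illustrative line:

```go
package main

import "fmt"

func main() {
	var kb uint64
	line := "MemAvailable:   16384 kB" // illustrative /proc/meminfo line
	// %d skips the leading spaces, so this matches the parse above.
	if _, err := fmt.Sscanf(line, "MemAvailable:%d", &kb); err != nil {
		panic(err)
	}
	fmt.Println(kb * 1024) // 16777216 bytes (16 MiB)
}
```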

+ 55 - 0
gpu/gpu_windows.go

@@ -0,0 +1,55 @@
+package gpu
+
+import (
+	"fmt"
+	"syscall"
+	"unsafe"
+)
+
+type MEMORYSTATUSEX struct {
+	length               uint32
+	MemoryLoad           uint32
+	TotalPhys            uint64
+	AvailPhys            uint64
+	TotalPageFile        uint64
+	AvailPageFile        uint64
+	TotalVirtual         uint64
+	AvailVirtual         uint64
+	AvailExtendedVirtual uint64
+}
+
+var (
+	k32                      = syscall.NewLazyDLL("kernel32.dll")
+	globalMemoryStatusExProc = k32.NewProc("GlobalMemoryStatusEx")
+	sizeofMemoryStatusEx     = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
+)
+
+var CudartGlobs = []string{
+	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
+}
+
+var NvmlGlobs = []string{
+	"c:\\Windows\\System32\\nvml.dll",
+}
+
+var NvcudaGlobs = []string{
+	"c:\\windows\\system*\\nvcuda.dll",
+}
+
+var OneapiGlobs = []string{
+	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
+}
+
+var CudartMgmtName = "cudart64_*.dll"
+var NvcudaMgmtName = "nvcuda.dll"
+var NvmlMgmtName = "nvml.dll"
+var OneapiMgmtName = "ze_intel_gpu64.dll"
+
+func GetCPUMem() (memInfo, error) {
+	memStatus := MEMORYSTATUSEX{length: sizeofMemoryStatusEx}
+	r1, _, err := globalMemoryStatusExProc.Call(uintptr(unsafe.Pointer(&memStatus)))
+	if r1 == 0 {
+		return memInfo{}, fmt.Errorf("GlobalMemoryStatusEx failed: %w", err)
+	}
+	return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys}, nil
+}
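
One detail the struct literal above encodes: GlobalMemoryStatusEx fails unless dwLength is initialized to the structure size before the call, and the error from Proc.Call is only meaningful when r1 == 0. A self-contained sketch of the same call outside the gpu package:

```go
//go:build windows

package main

import (
	"fmt"
	"syscall"
	"unsafe"
)

// memoryStatusEx mirrors MEMORYSTATUSEX; field order and sizes must match
// the Win32 layout for the call to fill it correctly.
type memoryStatusEx struct {
	length, memoryLoad           uint32
	totalPhys, availPhys         uint64
	totalPageFile, availPageFile uint64
	totalVirtual, availVirtual   uint64
	availExtendedVirtual         uint64
}

func main() {
	proc := syscall.NewLazyDLL("kernel32.dll").NewProc("GlobalMemoryStatusEx")
	ms := memoryStatusEx{length: uint32(unsafe.Sizeof(memoryStatusEx{}))} // dwLength set first
	if r1, _, err := proc.Call(uintptr(unsafe.Pointer(&ms))); r1 == 0 {
		panic(err) // err is only meaningful on failure
	}
	fmt.Printf("total=%d free=%d\n", ms.totalPhys, ms.availPhys)
}
```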

+ 11 - 22
gpu/types.go

@@ -18,7 +18,7 @@ type GpuInfo struct {
 	Library string `json:"library,omitempty"`
 
 	// Optional variant to select (e.g. versions, cpu feature flags)
-	Variant string `json:"variant,omitempty"`
+	Variant CPUCapability `json:"variant"`
 
 	// MinimumMemory represents the minimum memory required to use the GPU
 	MinimumMemory uint64 `json:"-"`
@@ -44,21 +44,21 @@ type CPUInfo struct {
 
 type CudaGPUInfo struct {
 	GpuInfo
-	index int // nolint: unused
+	index int //nolint:unused,nolintlint
 }
 type CudaGPUInfoList []CudaGPUInfo
 
 type RocmGPUInfo struct {
 	GpuInfo
-	usedFilepath string // nolint: unused
-	index        int    // nolint: unused
+	usedFilepath string //nolint:unused,nolintlint
+	index        int    //nolint:unused,nolintlint
 }
 type RocmGPUInfoList []RocmGPUInfo
 
 type OneapiGPUInfo struct {
 	GpuInfo
-	driverIndex int // nolint: unused
-	gpuIndex    int // nolint: unused
+	driverIndex int //nolint:unused,nolintlint
+	gpuIndex    int //nolint:unused,nolintlint
 }
 type OneapiGPUInfoList []OneapiGPUInfo
 
@@ -71,8 +71,8 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != "" {
-			requested += "_" + info.Variant
+		if info.Variant != CPUCapabilityNone {
+			requested += "_" + info.Variant.String()
 		}
 		for i, lib := range libs {
 			if lib == requested {
@@ -117,30 +117,19 @@ type CPUCapability uint32
 var GPURunnerCPUCapability = CPUCapabilityAVX
 
 const (
-	CPUCapabilityBase CPUCapability = iota
+	CPUCapabilityNone CPUCapability = iota
 	CPUCapabilityAVX
 	CPUCapabilityAVX2
 	// TODO AVX512
 )
 
-func (c CPUCapability) ToString() string {
-	switch c {
-	case CPUCapabilityAVX:
-		return "AVX"
-	case CPUCapabilityAVX2:
-		return "AVX2"
-	default:
-		return "no vector extensions"
-	}
-}
-
-func (c CPUCapability) ToVariant() string {
+func (c CPUCapability) String() string {
 	switch c {
 	case CPUCapabilityAVX:
 		return "avx"
 	case CPUCapabilityAVX2:
 		return "avx2"
 	default:
-		return ""
+		return "no vector extensions"
 	}
 }
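
Replacing ToString/ToVariant with String also makes CPUCapability satisfy fmt.Stringer, which is why call sites in gpu.go and llm/payload.go can pass the value straight to slog or append info.Variant.String(). A minimal sketch of the effect (enum copied locally):

```go
package main

import "fmt"

type CPUCapability uint32

const (
	CPUCapabilityNone CPUCapability = iota
	CPUCapabilityAVX
	CPUCapabilityAVX2
)

// String makes the type a fmt.Stringer, so %v and slog format it for free.
func (c CPUCapability) String() string {
	switch c {
	case CPUCapabilityAVX:
		return "avx"
	case CPUCapabilityAVX2:
		return "avx2"
	default:
		return "no vector extensions"
	}
}

func main() {
	fmt.Printf("required=%v detected=%v\n", CPUCapabilityAVX, CPUCapabilityNone)
	// required=avx detected=no vector extensions
}
```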

+ 2 - 1
integration/context_test.go

@@ -11,7 +11,8 @@ import (
 )
 
 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute) // Longer needed for small footprint GPUs
+	// A longer timeout is needed for small footprint GPUs
+	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Minute)
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{

+ 24 - 41
llm/memory.go

@@ -1,7 +1,6 @@
 package llm
 
 import (
-	"fmt"
 	"log/slog"
 	"strconv"
 	"strings"
@@ -69,13 +68,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 
 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
-	var includeOutput bool
 
-	// One extra layer as a pad for each GPU
-	var layerBuffer uint64
-
-	// The sizes of the main layers
-	var layerSizes []uint64
+	// The size of a single layer
+	var layerSize uint64
 
 	// The sum of all the layer sizes (just for logging)
 	var memoryWeights uint64
@@ -102,12 +97,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		layerBuffer = blk0.size()
+		layerSize = blk0.size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
+	// KV is proportional to the number of layers
+	layerSize += kv / ggml.KV().BlockCount()
+
 	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
@@ -119,6 +119,9 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
+	} else if len(gpus) > 1 {
+		// multigpu should always use the partial graph size
+		graphFullOffload = graphPartialOffload
 	}
 
 	if layer, ok := layers["output_norm"]; ok {
@@ -130,16 +133,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		memoryLayerOutput += layer.size()
 	}
 
-	if gpus[0].Library == "metal" && opts.UseMMap {
-		includeOutput = true
-	} else if gpus[0].Library != "metal" || !opts.UseMMap {
-		includeOutput = true
-	}
-
+	// Output layer handled at the end if we have space
 	gpuZeroOverhead := projectorSize
-	if includeOutput {
-		gpuZeroOverhead += memoryLayerOutput
-	}
 
 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -156,12 +151,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			continue
 		}
 		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
-		gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full
+		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
 	}
 
 	var gpuZeroID int
@@ -170,23 +165,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		gpuAllocations[gpuZeroID] += gpuZeroOverhead
 	}
 
-	layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
+	// For all the layers, find where they can fit on the GPU(s)
 	for i := range int(ggml.KV().BlockCount()) {
-		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
-			memoryLayer := blk.size()
-
-			// KV is proportional to the number of layers
-			memoryLayer += kv / ggml.KV().BlockCount()
-			layerSizes[i] = memoryLayer
-			memoryWeights += memoryLayer
-		}
-	}
+		memoryWeights += layerSize
 
-	// For all the layers, find where they can fit on the GPU(s)
-	for i := range layerSizes {
-		if layerSizes[i] == 0 {
-			continue
-		}
 		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
 			// Stop allocating on GPU(s) once we hit the users target NumGPU
 			continue
@@ -196,8 +178,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSizes[i] {
-				gpuAllocations[g.i] += layerSizes[i]
+			if g.g.FreeMemory > used+layerSize {
+				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
 				break
@@ -205,17 +187,18 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
-
 	}
 	if layerCount >= int(ggml.KV().BlockCount()) {
 		fullyLoaded = true
 	} else {
 		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
-			overflow += layerSizes[i]
+			overflow += layerSize
 		}
 	}
-	// Find where the output fits
-	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+
+	// Determine whether we need to consider the output layer, then find where it fits
+	if ((gpus[0].Library == "metal" && opts.UseMMap) || (gpus[0].Library != "metal" || !opts.UseMMap)) &&
+		memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
@@ -226,6 +209,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				break
 			}
 		}
+
 		if layerCount < int(ggml.KV().BlockCount())+1 {
 			fullyLoaded = false
 			overflow += memoryLayerOutput
@@ -253,7 +237,6 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	var memoryRequiredPartial, memoryRequiredTotal uint64
 	for i := range gpuAllocations {
 		memoryRequiredPartial += gpuAllocations[i]
-
 	}
 	memoryRequiredTotal = memoryRequiredPartial + overflow
 

+ 39 - 28
llm/memory_test.go

@@ -18,7 +18,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	envconfig.Debug = true
 	modelName := "dummy"
 	f, err := os.CreateTemp(t.TempDir(), modelName)
-	assert.Nil(t, err)
+	require.NoError(t, err)
 	defer f.Close()
 	gguf := NewGGUFV3(binary.LittleEndian)
 	inputLayerCount := 5
@@ -30,7 +30,7 @@ func TestEstimateGPULayers(t *testing.T) {
 		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 	}
-	assert.Equal(t, inputLayerCount+1, len(tensors))
+	assert.Len(t, tensors, inputLayerCount+1)
 	err = gguf.Encode(f, KV{
 		"general.architecture":          "llama",
 		"general.name":                  "name",
@@ -56,9 +56,11 @@ func TestEstimateGPULayers(t *testing.T) {
 	}
 	projectors := []string{}
 	opts := api.DefaultOptions()
-	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
-	assert.Equal(t, 0, estimate.Layers)
-	assert.Equal(t, uint64(0), estimate.Graph)
+	t.Run("cpu", func(t *testing.T) {
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		assert.Equal(t, 0, estimate.Layers)
+		assert.Equal(t, uint64(0), estimate.Graph)
+	})
 
 	// derived from the dummy ggml file above
 	graphPartialOffload := uint64(202377216)
@@ -80,7 +82,10 @@ func TestEstimateGPULayers(t *testing.T) {
 		},
 	}
 	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
-	for i, s := range [][]uint64{
+	for i, s := range []struct {
+		layer0, layer1   uint64
+		expect0, expect1 uint64
+	}{
 		{1, 1, 1, 1},
 		{2, 1, 2, 1},
 		{2, 2, 2, 2},
@@ -90,27 +95,33 @@ func TestEstimateGPULayers(t *testing.T) {
 		{6, 6, 3, 3},
 		{0, 3, 0, 3},
 	} {
-		gpus[0].FreeMemory = 0
-		gpus[1].FreeMemory = 0
-		gpus[0].FreeMemory += projectorSize + memoryLayerOutput
-		gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
-		gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
-		gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
-		gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
-		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
-		assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
-		assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
-		var layerSums uint64
-		for _, b := range estimate.GPUSizes {
-			layerSums += b
-		}
-		if estimate.Layers < inputLayerCount+1 {
-			assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
-			assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
-		} else {
-			assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
-			assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
-		}
+		t.Run(fmt.Sprintf("%v", s), func(t *testing.T) {
+			gpus[0].FreeMemory = 0
+			gpus[1].FreeMemory = 0
+			gpus[0].FreeMemory += projectorSize
+			if s.layer0 > 0 {
+				gpus[0].FreeMemory += memoryLayerOutput
+			} else {
+				gpus[1].FreeMemory += memoryLayerOutput
+			}
+			gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s.layer0*layerSize + 1
+			gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s.layer1*layerSize + 1
+			gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
+			gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
+			estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+			assert.Equal(t, int(s.expect0+s.expect1), estimate.Layers, "scenario %d: %v", i, s)
+			assert.Equal(t, fmt.Sprintf("%d,%d", s.expect0, s.expect1), estimate.TensorSplit, "scenario %d: %v", i, s)
+			var layerSums uint64
+			for _, b := range estimate.GPUSizes {
+				layerSums += b
+			}
+			if estimate.Layers < inputLayerCount+1 {
+				assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+				assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+			} else {
+				assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+				assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+			}
+		})
 	}
-
 }

+ 8 - 8
llm/payload.go

@@ -82,8 +82,8 @@ func serversForGpu(info gpu.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := availableServers()
 	requested := info.Library
-	if info.Variant != "" {
-		requested += "_" + info.Variant
+	if info.Variant != gpu.CPUCapabilityNone {
+		requested += "_" + info.Variant.String()
 	}
 
 	servers := []string{}
@@ -117,14 +117,14 @@ func serversForGpu(info gpu.GpuInfo) []string {
 
 	// Load up the best CPU variant if not primary requested
 	if info.Library != "cpu" {
-		variant := gpu.GetCPUVariant()
+		variant := gpu.GetCPUCapability()
 		// If no variant, then we fall back to default
 		// If we have a variant, try that if we find an exact match
 		// Attempting to run the wrong CPU instructions will panic the
 		// process
-		if variant != "" {
+		if variant != gpu.CPUCapabilityNone {
 			for cmp := range availableServers {
-				if cmp == "cpu_"+variant {
+				if cmp == "cpu_"+variant.String() {
 					servers = append(servers, cmp)
 					break
 				}
@@ -146,11 +146,11 @@ func serverForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		return "metal"
 	}
-	variant := gpu.GetCPUVariant()
+	variant := gpu.GetCPUCapability()
 	availableServers := availableServers()
-	if variant != "" {
+	if variant != gpu.CPUCapabilityNone {
 		for cmp := range availableServers {
-			if cmp == "cpu_"+variant {
+			if cmp == "cpu_"+variant.String() {
 				return cmp
 			}
 		}
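
The variant checks above feed the runner lookup, which matches payload directories named <library> or <library>_<variant> (e.g. cpu, cpu_avx, cpu_avx2). A small sketch of the key construction; the helper name is mine, not from the tree:

```go
package main

import "fmt"

// runnerKey sketches the lookup key serversForGpu builds from a GPU's
// library and optional variant string.
func runnerKey(library, variant string) string {
	if variant != "" {
		return library + "_" + variant
	}
	return library
}

func main() {
	fmt.Println(runnerKey("cpu", "avx2")) // cpu_avx2
	fmt.Println(runnerKey("cuda", ""))    // cuda
}
```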

+ 2 - 2
llm/server.go

@@ -39,7 +39,7 @@ type LlamaServer interface {
 	Close() error
 	EstimatedVRAM() uint64 // Total VRAM across all GPUs
 	EstimatedTotal() uint64
-	EstimagedVRAMByGPU(gpuID string) uint64
+	EstimatedVRAMByGPU(gpuID string) uint64
 }
 
 // llmServer is an instance of the llama.cpp server
@@ -1016,7 +1016,7 @@ func (s *llmServer) EstimatedTotal() uint64 {
 	return s.estimate.TotalSize
 }
 
-func (s *llmServer) EstimagedVRAMByGPU(gpuID string) uint64 {
+func (s *llmServer) EstimatedVRAMByGPU(gpuID string) uint64 {
 	for i, gpu := range s.gpus {
 		if gpu.ID == gpuID {
 			return s.estimate.GPUSizes[i]

+ 3 - 6
server/sched.go

@@ -182,7 +182,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						// We want to avoid loading on any GPUs that have other
 						// models still loading on them to avoid potential races
 						// with VRAM consumption ramping up during load
-						availGpus := s.filterGPUsWithLoadingModels(gpus)
+						availGpus := s.filterGPUsWithoutLoadingModels(gpus)
 
 						// Update free memory from currently loaded models
 						s.updateFreeSpace(availGpus)
@@ -414,9 +414,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 		r.refMu.Lock()
 		if r.llama != nil {
 			for _, gpu := range allGpus {
-				// if slices.Contains(gpuIDs, gpu.ID) {
-				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimagedVRAMByGPU(gpu.ID)
-				// }
+				predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID)
 			}
 		} else {
 			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
@@ -448,7 +446,7 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
 	ret := append(gpu.GpuInfoList{}, allGpus...)
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
@@ -702,5 +700,4 @@ func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML,
 	// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
 
 	return s.findRunnerToUnload()
-
 }

+ 37 - 3
server/sched_test.go

@@ -156,7 +156,7 @@ func TestRequests(t *testing.T) {
 
 	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
-	scenario1a.req.sessionDuration = 0
+	scenario1a.req.sessionDuration = 5 * time.Millisecond
 	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
 	scenario1b.req.model = scenario1a.req.model
 	scenario1b.ggml = scenario1a.ggml
@@ -167,6 +167,7 @@ func TestRequests(t *testing.T) {
 	tmpModel := *scenario1a.req.model
 	scenario2a.req.model = &tmpModel
 	scenario2a.ggml = scenario1a.ggml
+	scenario2a.req.sessionDuration = 5 * time.Millisecond
 
 	// Multiple loaded models
 	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
@@ -316,7 +317,6 @@ func TestGetRunner(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
 
-	// Same model, same request
 	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
 	scenario1a.req.sessionDuration = 0
 	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
@@ -475,6 +475,40 @@ func TestUpdateFreeSpace(t *testing.T) {
 	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
 }
 
+func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer done()
+	gpus := gpu.GpuInfoList{
+		{
+			Library: "cuda",
+			ID:      "0",
+		},
+		{
+			Library: "cuda",
+			ID:      "1",
+		},
+	}
+	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+
+	s := InitScheduler(ctx)
+	s.loadedMu.Lock()
+	s.loaded["a"] = r1
+	s.loadedMu.Unlock()
+
+	tmp := s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 1)
+	require.Equal(t, "1", tmp[0].ID)
+
+	r1.gpus = gpu.GpuInfoList{gpus[1]}
+	tmp = s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 1)
+	require.Equal(t, "0", tmp[0].ID)
+
+	r1.gpus = gpu.GpuInfoList{}
+	tmp = s.filterGPUsWithoutLoadingModels(gpus)
+	require.Len(t, tmp, 2)
+}
+
 func TestFindRunnerToUnload(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
@@ -607,4 +641,4 @@ func (s *mockLlm) Close() error {
 }
 func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
 func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
-func (s *mockLlm) EstimagedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
+func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }