
Track GPU discovery failure information (#5820)

* Expose GPU discovery failure information

* Remove exposed API for now
Daniel Hiltgen committed 6 months ago
Commit f3c8b898cd
5 files changed, 242 insertions and 92 deletions
  1. gpu/amd_linux.go (+66, -26)
  2. gpu/amd_windows.go (+42, -31)
  3. gpu/gpu.go (+110, -35)
  4. gpu/gpu_darwin.go (+12, -0)
  5. gpu/types.go (+12, -0)

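In short, discovery no longer fails silently: each skipped device is recorded in unsupportedGPUs with a reason, each failed bootstrap step is appended to bootstrapErrors, and both are surfaced through the new GetSystemInfo() added in gpu/gpu.go below. A minimal sketch of how a caller might consume that information for troubleshooting; the logging is illustrative and not part of this commit, and the import path is assumed from the ollama module layout:

    package main

    import (
        "log/slog"

        "github.com/ollama/ollama/gpu"
    )

    func main() {
        // GetSystemInfo runs (or reuses) GPU discovery and aggregates the results.
        info := gpu.GetSystemInfo()
        for _, u := range info.UnsupportedGPUs {
            // Each skipped GPU carries the reason it was rejected.
            slog.Warn("gpu skipped", "id", u.ID, "library", u.Library, "reason", u.Reason)
        }
        for _, msg := range info.DiscoveryErrors {
            // Bootstrap failures: missing or broken driver libraries, permissions, old hardware.
            slog.Warn("gpu discovery error", "error", msg)
        }
    }
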
gpu/amd_linux.go (+66, -26)

@@ -47,10 +47,11 @@ var (
 )
 
 // Gather GPU information from the amdgpu driver if any supported GPUs are detected
-func AMDGetGPUInfo() []RocmGPUInfo {
+// Only called once during bootstrap
+func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	resp := []RocmGPUInfo{}
 	if !AMDDetected() {
-		return resp
+		return resp, fmt.Errorf("AMD GPUs not detected")
 	}
 
 	// Opportunistic logging of driver version to aid in troubleshooting
@@ -194,13 +195,9 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 
 		// Shouldn't happen, but just in case...
 		if gpuID < 0 {
-			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			return nil
-		}
-
-		if int(major) < RocmComputeMin {
-			slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
-			continue
+			err := fmt.Errorf("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
+			slog.Error(err.Error())
+			return nil, err
 		}
 
 		// Look up the memory for the current node
@@ -270,19 +267,12 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			break
 		}
 
-		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
-		if totalMemory < IGPUMemLimit {
-			slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
-			continue
-		}
 		var name string
 		// TODO - PCI ID lookup
 		if vendor > 0 && device > 0 {
 			name = fmt.Sprintf("%04x:%04x", vendor, device)
 		}
 
-		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
-		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
 		gpuInfo := RocmGPUInfo{
 			GpuInfo: GpuInfo{
 				Library: "rocm",
@@ -300,6 +290,31 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			usedFilepath: usedFile,
 		}
 
+		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
+		if totalMemory < IGPUMemLimit {
+			reason := "unsupported Radeon iGPU detected skipping"
+			slog.Info(reason, "id", gpuID, "total", format.HumanBytes2(totalMemory))
+			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
+				GpuInfo: gpuInfo.GpuInfo,
+				Reason:  reason,
+			})
+			continue
+		}
+
+		if int(major) < RocmComputeMin {
+			reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
+			slog.Warn(reason, "gpu", gpuID)
+			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
+				GpuInfo: gpuInfo.GpuInfo,
+				Reason:  reason,
+			})
+
+			continue
+		}
+
+		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
+
 		// If the user wants to filter to a subset of devices, filter out if we aren't a match
 		if len(visibleDevices) > 0 {
 			include := false
@@ -310,7 +325,13 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 				}
 			}
 			if !include {
-				slog.Info("filtering out device per user request", "id", gpuInfo.ID, "visible_devices", visibleDevices)
+				reason := "filtering out device per user request"
+				slog.Info(reason, "id", gpuInfo.ID, "visible_devices", visibleDevices)
+				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
+					GpuInfo: gpuInfo.GpuInfo,
+					Reason:  reason,
+				})
+
 				continue
 			}
 		}
@@ -320,8 +341,13 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		if libDir == "" {
 			libDir, err = AMDValidateLibDir()
 			if err != nil {
-				slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-				return nil
+				err = fmt.Errorf("unable to verify rocm library: %w", err)
+				slog.Warn(err.Error())
+				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
+					GpuInfo: gpuInfo.GpuInfo,
+					Reason:  err.Error(),
+				})
+				return nil, err
 			}
 		}
 		gpuInfo.DependencyPath = libDir
@@ -331,14 +357,25 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			if len(supported) == 0 {
 				supported, err = GetSupportedGFX(libDir)
 				if err != nil {
-					slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-					return nil
+					err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
+					slog.Warn(err.Error())
+					unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
+						GpuInfo: gpuInfo.GpuInfo,
+						Reason:  err.Error(),
+					})
+					return nil, err
 				}
 				slog.Debug("rocm supported GPUs", "types", supported)
 			}
 			gfx := gpuInfo.Compute
 			if !slices.Contains[[]string, string](supported, gfx) {
-				slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
+				reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
+				slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
+				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
+					GpuInfo: gpuInfo.GpuInfo,
+					Reason:  reason,
+				})
+
 				// TODO - consider discrete markdown just for ROCM troubleshooting?
 				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
 				continue
@@ -358,13 +395,16 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		resp = append(resp, gpuInfo)
 	}
 	if len(resp) == 0 {
-		slog.Info("no compatible amdgpu devices detected")
+		err := fmt.Errorf("no compatible amdgpu devices detected")
+		slog.Info(err.Error())
+		return nil, err
 	}
 	if err := verifyKFDDriverAccess(); err != nil {
-		slog.Error("amdgpu devices detected but permission problems block access", "error", err)
-		return nil
+		err = fmt.Errorf("amdgpu devices detected but permission problems block access: %w", err)
+		slog.Error(err.Error())
+		return nil, err
 	}
-	return resp
+	return resp, nil
 }
 
 // Quick check for AMD driver so we can skip amdgpu discovery if not present

gpu/amd_windows.go (+42, -31)

@@ -3,6 +3,7 @@ package gpu
 import (
 	"bytes"
 	"errors"
+	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
@@ -26,12 +27,13 @@ var (
 	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob?
 )
 
-func AMDGetGPUInfo() []RocmGPUInfo {
+// Only called once during bootstrap
+func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	resp := []RocmGPUInfo{}
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
-		return nil
+		return nil, err
 	}
 	defer hl.Release()
 
@@ -44,12 +46,15 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
 	count := hl.HipGetDeviceCount()
 	if count == 0 {
-		return nil
+		err := fmt.Errorf("no compatible amdgpu devices detected")
+		slog.Info(err.Error())
+		return nil, err
 	}
 	libDir, err := AMDValidateLibDir()
 	if err != nil {
-		slog.Warn("unable to verify rocm library, will use cpu", "error", err)
-		return nil
+		err = fmt.Errorf("unable to verify rocm library: %w", err)
+		slog.Warn(err.Error())
+		return nil, err
 	}
 
 	var supported []string
@@ -57,8 +62,9 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	if gfxOverride == "" {
 		supported, err = GetSupportedGFX(libDir)
 		if err != nil {
-			slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
-			return nil
+			err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
+			slog.Warn(err.Error())
+			return nil, err
 		}
 	} else {
 		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
@@ -87,21 +93,6 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
 		// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY!  Always 0
 		// TODO  Why isn't props.iGPU accurate!?
-		if strings.EqualFold(name, iGPUName) {
-			slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
-			continue
-		}
-		if gfxOverride == "" {
-			// Strip off Target Features when comparing
-			if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
-				slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
-				// TODO - consider discrete markdown just for ROCM troubleshooting?
-				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
-				continue
-			} else {
-				slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
-			}
-		}
 
 		freeMemory, totalMemory, err := hl.HipMemGetInfo()
 		if err != nil {
@@ -109,14 +100,6 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			continue
 		}
 
-		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
-		if totalMemory < IGPUMemLimit {
-			slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
-			continue
-		}
-
-		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
-		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
 		gpuInfo := RocmGPUInfo{
 			GpuInfo: GpuInfo{
 				Library: "rocm",
@@ -138,10 +121,38 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			index: i,
 		}
 
+		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
+		if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
+			reason := "unsupported Radeon iGPU detected skipping"
+			slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
+			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
+				GpuInfo: gpuInfo.GpuInfo,
+				Reason:  reason,
+			})
+			continue
+		}
+
+		// Strip off Target Features when comparing
+		if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
+			reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
+			slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
+			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
+				GpuInfo: gpuInfo.GpuInfo,
+				Reason:  reason,
+			})
+			// HSA_OVERRIDE_GFX_VERSION not supported on windows
+			continue
+		} else {
+			slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
+		}
+
+		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
+		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
+
 		resp = append(resp, gpuInfo)
 	}
 
-	return resp
+	return resp, nil
 }
 
 func AMDValidateLibDir() (string, error) {

gpu/gpu.go (+110, -35)

@@ -54,6 +54,13 @@ var (
 	nvmlLibPath   string
 	rocmGPUs      []RocmGPUInfo
 	oneapiGPUs    []OneapiGPUInfo
+
+	// If any discovered GPUs are incompatible, report why
+	unsupportedGPUs []UnsupportedGPUInfo
+
+	// Keep track of errors during bootstrapping so that if GPUs that were
+	// expected to be present are missing, this may explain why
+	bootstrapErrors []error
 )
 
 // With our current CUDA compile flags, older than 5.0 will not work properly
@@ -70,16 +77,17 @@ func initCudaHandles() *cudaHandles {
 
 	cHandles := &cudaHandles{}
 	// Short Circuit if we already know which library to use
+	// ignore bootstrap errors in this case since we already recorded them
 	if nvmlLibPath != "" {
-		cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
+		cHandles.nvml, _, _ = loadNVMLMgmt([]string{nvmlLibPath})
 		return cHandles
 	}
 	if nvcudaLibPath != "" {
-		cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
+		cHandles.deviceCount, cHandles.nvcuda, _, _ = loadNVCUDAMgmt([]string{nvcudaLibPath})
 		return cHandles
 	}
 	if cudartLibPath != "" {
-		cHandles.deviceCount, cHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
+		cHandles.deviceCount, cHandles.cudart, _, _ = loadCUDARTMgmt([]string{cudartLibPath})
 		return cHandles
 	}
 
@@ -102,18 +110,21 @@ func initCudaHandles() *cudaHandles {
 	if len(NvmlGlobs) > 0 {
 		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
 		if len(nvmlLibPaths) > 0 {
-			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
+			nvml, libPath, err := loadNVMLMgmt(nvmlLibPaths)
 			if nvml != nil {
 				slog.Debug("nvidia-ml loaded", "library", libPath)
 				cHandles.nvml = nvml
 				nvmlLibPath = libPath
 			}
+			if err != nil {
+				bootstrapErrors = append(bootstrapErrors, err)
+			}
 		}
 	}
 
 	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
-		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
+		deviceCount, nvcuda, libPath, err := loadNVCUDAMgmt(nvcudaLibPaths)
 		if nvcuda != nil {
 			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
 			cHandles.nvcuda = nvcuda
@@ -121,11 +132,14 @@ func initCudaHandles() *cudaHandles {
 			nvcudaLibPath = libPath
 			return cHandles
 		}
+		if err != nil {
+			bootstrapErrors = append(bootstrapErrors, err)
+		}
 	}
 
 	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
-		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
+		deviceCount, cudart, libPath, err := loadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
 			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
 			cHandles.cudart = cudart
@@ -133,6 +147,9 @@ func initCudaHandles() *cudaHandles {
 			cudartLibPath = libPath
 			return cHandles
 		}
+		if err != nil {
+			bootstrapErrors = append(bootstrapErrors, err)
+		}
 	}
 
 	return cHandles
@@ -143,14 +160,19 @@ func initOneAPIHandles() *oneapiHandles {
 	oHandles := &oneapiHandles{}
 
 	// Short Circuit if we already know which library to use
+	// ignore bootstrap errors in this case since we already recorded them
 	if oneapiLibPath != "" {
-		oHandles.deviceCount, oHandles.oneapi, _ = LoadOneapiMgmt([]string{oneapiLibPath})
+		oHandles.deviceCount, oHandles.oneapi, _, _ = loadOneapiMgmt([]string{oneapiLibPath})
 		return oHandles
 	}
 
 	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
 	if len(oneapiLibPaths) > 0 {
-		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
+		var err error
+		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath, err = loadOneapiMgmt(oneapiLibPaths)
+		if err != nil {
+			bootstrapErrors = append(bootstrapErrors, err)
+		}
 	}
 
 	return oHandles
@@ -197,6 +219,7 @@ func GetGPUInfo() GpuInfoList {
 
 	if !bootstrapped {
 		slog.Info("looking for compatible GPUs")
+		bootstrapErrors = []error{}
 		needRefresh = false
 		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t
@@ -221,7 +244,9 @@ func GetGPUInfo() GpuInfoList {
 
 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
-			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
+			err := fmt.Errorf("CPU does not have minimum vector extensions, GPU inference disabled.  Required:%s  Detected:%s", GPURunnerCPUCapability, cpuCapability)
+			slog.Warn(err.Error())
+			bootstrapErrors = append(bootstrapErrors, err)
 			bootstrapped = true
 			// No need to do any GPU discovery, since we can't run on them
 			return GpuInfoList{cpus[0].GpuInfo}
@@ -253,10 +278,6 @@ func GetGPUInfo() GpuInfoList {
 					C.free(unsafe.Pointer(memInfo.err))
 					continue
 				}
-				if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
-					slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
-					continue
-				}
 				gpuInfo.TotalMemory = uint64(memInfo.total)
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
@@ -279,6 +300,15 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.Variant = variant
 
+				if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+					reason := fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor)
+					slog.Info(reason)
+					unsupportedGPUs = append(unsupportedGPUs,
+						UnsupportedGPUInfo{
+							GpuInfo: gpuInfo.GpuInfo,
+							Reason:  reason,
+						})
+					continue
+				}
+
 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
 				if cHandles.nvml != nil {
@@ -341,7 +371,10 @@ func GetGPUInfo() GpuInfoList {
 			}
 		}
 
-		rocmGPUs = AMDGetGPUInfo()
+		rocmGPUs, err = AMDGetGPUInfo()
+		if err != nil {
+			bootstrapErrors = append(bootstrapErrors, err)
+		}
 		bootstrapped = true
 		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
 			slog.Info("no compatible GPUs were discovered")
@@ -526,92 +559,114 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	return gpuLibPaths
 }
 
-func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
+// Bootstrap the runtime library
+// Returns: num devices, handle, libPath, error
+func loadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string, error) {
 	var resp C.cudart_init_resp_t
 	resp.ch.verbose = getVerboseState()
+	var err error
 	for _, libPath := range cudartLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.cudart_init(lib, &resp)
 		if resp.err != nil {
-			slog.Debug("Unable to load cudart", "library", libPath, "error", C.GoString(resp.err))
+			err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err))
+			slog.Debug(err.Error())
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			return int(resp.num_devices), &resp.ch, libPath
+			err = nil
+			return int(resp.num_devices), &resp.ch, libPath, err
 		}
 	}
-	return 0, nil, ""
+	return 0, nil, "", err
 }
 
-func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
+// Bootstrap the driver library
+// Returns: num devices, handle, libPath, error
+func loadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string, error) {
 	var resp C.nvcuda_init_resp_t
 	resp.ch.verbose = getVerboseState()
+	var err error
 	for _, libPath := range nvcudaLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.nvcuda_init(lib, &resp)
 		if resp.err != nil {
 			// Decide what log level based on the type of error message to help users understand why
-			msg := C.GoString(resp.err)
 			switch resp.cudaErr {
 			case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
-				slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg)
+				err = fmt.Errorf("version mismatch between driver and cuda driver library - reboot or upgrade may be required: library %s", libPath)
+				slog.Warn(err.Error())
 			case C.CUDA_ERROR_NO_DEVICE:
-				slog.Info("no nvidia devices detected", "library", libPath)
+				err = fmt.Errorf("no nvidia devices detected by library %s", libPath)
+				slog.Info(err.Error())
 			case C.CUDA_ERROR_UNKNOWN:
-				slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg)
-				slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information")
+				err = fmt.Errorf("unknown error initializing cuda driver library %s: %s. see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information", libPath, C.GoString(resp.err))
+				slog.Warn(err.Error())
 			default:
+				msg := C.GoString(resp.err)
 				if strings.Contains(msg, "wrong ELF class") {
 					slog.Debug("skipping 32bit library", "library", libPath)
 				} else {
-					slog.Info("unable to load cuda driver library", "library", libPath, "error", msg)
+					err = fmt.Errorf("Unable to load cuda driver library %s: %s", libPath, C.GoString(resp.err))
+					slog.Info(err.Error())
 				}
 			}
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			return int(resp.num_devices), &resp.ch, libPath
+			err = nil
+			return int(resp.num_devices), &resp.ch, libPath, err
 		}
 	}
-	return 0, nil, ""
+	return 0, nil, "", err
 }
 
-func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
+// Bootstrap the management library
+// Returns: handle, libPath, error
+func loadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string, error) {
 	var resp C.nvml_init_resp_t
 	resp.ch.verbose = getVerboseState()
+	var err error
 	for _, libPath := range nvmlLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.nvml_init(lib, &resp)
 		if resp.err != nil {
-			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			err = fmt.Errorf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err))
+			slog.Info(err.Error())
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			return &resp.ch, libPath
+			err = nil
+			return &resp.ch, libPath, err
 		}
 	}
-	return nil, ""
+	return nil, "", err
 }
 
-func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
+// bootstrap the Intel GPU library
+// Returns: num devices, handle, libPath, error
+func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, error) {
 	var resp C.oneapi_init_resp_t
 	num_devices := 0
 	resp.oh.verbose = getVerboseState()
+	var err error
 	for _, libPath := range oneapiLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.oneapi_init(lib, &resp)
 		if resp.err != nil {
-			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
+			err = fmt.Errorf("Unable to load oneAPI management library %s: %s", libPath, C.GoString(resp.err))
+			slog.Debug(err.Error())
 			C.free(unsafe.Pointer(resp.err))
 		} else {
+			err = nil
 			for i := range resp.oh.num_drivers {
 				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
 			}
-			return num_devices, &resp.oh, libPath
+			return num_devices, &resp.oh, libPath, err
 		}
 	}
-	return 0, nil, ""
+	return 0, nil, "", err
 }
 
 func getVerboseState() C.uint16_t {
@@ -669,3 +724,23 @@ func LibraryDir() string {
 	slog.Warn("unable to locate gpu dependency libraries")
 	return ""
 }
+
+func GetSystemInfo() SystemInfo {
+	gpus := GetGPUInfo()
+	gpuMutex.Lock()
+	defer gpuMutex.Unlock()
+	discoveryErrors := []string{}
+	for _, err := range bootstrapErrors {
+		discoveryErrors = append(discoveryErrors, err.Error())
+	}
+	if len(gpus) == 1 && gpus[0].Library == "cpu" {
+		gpus = []GpuInfo{}
+	}
+
+	return SystemInfo{
+		System:          cpus[0],
+		GPUs:            gpus,
+		UnsupportedGPUs: unsupportedGPUs,
+		DiscoveryErrors: discoveryErrors,
+	}
+}

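Note that GetSystemInfo strips the synthetic "cpu" placeholder from the GPU list, so a host without usable GPUs reports an empty gpus array while the CPU itself is still described under System. A small follow-on sketch of that distinction, reusing the imports from the sketch above (the helper name is hypothetical):

    func logDiscoverySummary() {
        info := gpu.GetSystemInfo()
        if len(info.GPUs) == 0 {
            // CPU-only host: the "cpu" placeholder entry has been removed;
            // any explanations live in UnsupportedGPUs and DiscoveryErrors.
            slog.Info("no usable GPUs detected",
                "unsupported", len(info.UnsupportedGPUs),
                "discovery_errors", len(info.DiscoveryErrors))
            return
        }
        for _, g := range info.GPUs {
            slog.Info("gpu detected", "id", g.ID, "library", g.Library)
        }
    }
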
gpu/gpu_darwin.go (+12, -0)

@@ -66,3 +66,15 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 	// No-op on darwin
 	return "", ""
 }
+
+func GetSystemInfo() SystemInfo {
+	mem, _ := GetCPUMem()
+	return SystemInfo{
+		System: CPUInfo{
+			GpuInfo: GpuInfo{
+				memInfo: mem,
+			},
+		},
+		GPUs: GetGPUInfo(),
+	}
+}

gpu/types.go (+12, -0)

@@ -76,6 +76,11 @@ type OneapiGPUInfoList []OneapiGPUInfo
 
 type GpuInfoList []GpuInfo
 
+type UnsupportedGPUInfo struct {
+	GpuInfo
+	Reason string `json:"reason"`
+}
+
 // Split up the set of gpu info's by Library and variant
 func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	resp := []GpuInfoList{}
@@ -146,3 +151,10 @@ func (c CPUCapability) String() string {
 		return "no vector extensions"
 	}
 }
+
+type SystemInfo struct {
+	System          CPUInfo              `json:"system"`
+	GPUs            []GpuInfo            `json:"gpus"`
+	UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
+	DiscoveryErrors []string             `json:"discovery_errors"`
+}
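
SystemInfo is plain data with json tags, so it can be marshaled directly when the failure information needs to leave the process (the commit message notes the exposed API was removed for now). A minimal sketch assuming encoding/json and the gpu import from the first sketch; the top-level field names follow the tags above:

    // dumpSystemInfo renders the discovery report, e.g. to attach to a bug report.
    // Top-level keys are "system", "gpus", "unsupported_gpus" and "discovery_errors".
    func dumpSystemInfo() ([]byte, error) {
        info := gpu.GetSystemInfo()
        return json.MarshalIndent(info, "", "  ")
    }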