@@ -54,6 +54,13 @@ var (
 	nvmlLibPath string
 	rocmGPUs    []RocmGPUInfo
 	oneapiGPUs  []OneapiGPUInfo
+
+	// If any discovered GPUs are incompatible, report why
+	unsupportedGPUs []UnsupportedGPUInfo
+
+	// Keep track of errors during bootstrapping so that if expected GPUs are
+	// missing, this may explain why
+	bootstrapErrors []error
 )
 
 // With our current CUDA compile flags, older than 5.0 will not work properly
@@ -70,16 +77,17 @@ func initCudaHandles() *cudaHandles {
 	cHandles := &cudaHandles{}
 	// Short Circuit if we already know which library to use
+	// ignore bootstrap errors in this case since we already recorded them
 	if nvmlLibPath != "" {
-		cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
+		cHandles.nvml, _, _ = loadNVMLMgmt([]string{nvmlLibPath})
 		return cHandles
 	}
 	if nvcudaLibPath != "" {
-		cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
+		cHandles.deviceCount, cHandles.nvcuda, _, _ = loadNVCUDAMgmt([]string{nvcudaLibPath})
 		return cHandles
 	}
 	if cudartLibPath != "" {
-		cHandles.deviceCount, cHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
+		cHandles.deviceCount, cHandles.cudart, _, _ = loadCUDARTMgmt([]string{cudartLibPath})
 		return cHandles
 	}
@@ -102,18 +110,21 @@ func initCudaHandles() *cudaHandles {
 	if len(NvmlGlobs) > 0 {
 		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
 		if len(nvmlLibPaths) > 0 {
-			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
+			nvml, libPath, err := loadNVMLMgmt(nvmlLibPaths)
 			if nvml != nil {
 				slog.Debug("nvidia-ml loaded", "library", libPath)
 				cHandles.nvml = nvml
 				nvmlLibPath = libPath
 			}
+			if err != nil {
+				bootstrapErrors = append(bootstrapErrors, err)
+			}
 		}
 	}
 
 	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
-		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
+		deviceCount, nvcuda, libPath, err := loadNVCUDAMgmt(nvcudaLibPaths)
 		if nvcuda != nil {
 			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
 			cHandles.nvcuda = nvcuda
@@ -121,11 +132,14 @@ func initCudaHandles() *cudaHandles {
 			nvcudaLibPath = libPath
 			return cHandles
 		}
+		if err != nil {
+			bootstrapErrors = append(bootstrapErrors, err)
+		}
 	}
 
 	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
-		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
+		deviceCount, cudart, libPath, err := loadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
 			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
 			cHandles.cudart = cudart
@@ -133,6 +147,9 @@ func initCudaHandles() *cudaHandles {
 			cudartLibPath = libPath
 			return cHandles
 		}
+		if err != nil {
+			bootstrapErrors = append(bootstrapErrors, err)
+		}
 	}
 
 	return cHandles
@@ -143,14 +160,19 @@ func initOneAPIHandles() *oneapiHandles {
 	oHandles := &oneapiHandles{}
 
 	// Short Circuit if we already know which library to use
+	// ignore bootstrap errors in this case since we already recorded them
 	if oneapiLibPath != "" {
-		oHandles.deviceCount, oHandles.oneapi, _ = LoadOneapiMgmt([]string{oneapiLibPath})
+		oHandles.deviceCount, oHandles.oneapi, _, _ = loadOneapiMgmt([]string{oneapiLibPath})
 		return oHandles
 	}
 
 	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
 	if len(oneapiLibPaths) > 0 {
-		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
+		var err error
+		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath, err = loadOneapiMgmt(oneapiLibPaths)
+		if err != nil {
+			bootstrapErrors = append(bootstrapErrors, err)
+		}
 	}
 
 	return oHandles
@@ -197,6 +219,7 @@ func GetGPUInfo() GpuInfoList {
 	if !bootstrapped {
 		slog.Info("looking for compatible GPUs")
+		bootstrapErrors = []error{}
 		needRefresh = false
 		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t
@@ -221,7 +244,9 @@ func GetGPUInfo() GpuInfoList {
 
 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
-			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
+			err := fmt.Errorf("CPU does not have minimum vector extensions, GPU inference disabled. Required:%s Detected:%s", GPURunnerCPUCapability, cpuCapability)
+			slog.Warn(err.Error())
+			bootstrapErrors = append(bootstrapErrors, err)
 			bootstrapped = true
 			// No need to do any GPU discovery, since we can't run on them
 			return GpuInfoList{cpus[0].GpuInfo}
@@ -253,10 +278,6 @@ func GetGPUInfo() GpuInfoList {
 				C.free(unsafe.Pointer(memInfo.err))
 				continue
 			}
-			if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
-				slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
-				continue
-			}
 			gpuInfo.TotalMemory = uint64(memInfo.total)
 			gpuInfo.FreeMemory = uint64(memInfo.free)
 			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
@@ -279,6 +300,15 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 			gpuInfo.Variant = variant
 
+			if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+				unsupportedGPUs = append(unsupportedGPUs,
+					UnsupportedGPUInfo{
+						GpuInfo: gpuInfo.GpuInfo,
+					})
+				slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
+				continue
+			}
+
 			// query the management library as well so we can record any skew between the two
 			// which represents overhead on the GPU we must set aside on subsequent updates
 			if cHandles.nvml != nil {
@@ -341,7 +371,10 @@ func GetGPUInfo() GpuInfoList {
 				}
 			}
 		}
-		rocmGPUs = AMDGetGPUInfo()
+		rocmGPUs, err = AMDGetGPUInfo()
+		if err != nil {
+			bootstrapErrors = append(bootstrapErrors, err)
+		}
 		bootstrapped = true
 		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
 			slog.Info("no compatible GPUs were discovered")
@@ -526,92 +559,114 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	return gpuLibPaths
 }
 
-func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
+// Bootstrap the runtime library
+// Returns: num devices, handle, libPath, error
+func loadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string, error) {
 	var resp C.cudart_init_resp_t
 	resp.ch.verbose = getVerboseState()
+	var err error
 	for _, libPath := range cudartLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.cudart_init(lib, &resp)
 		if resp.err != nil {
-			slog.Debug("Unable to load cudart", "library", libPath, "error", C.GoString(resp.err))
+			err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err))
+			slog.Debug(err.Error())
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			return int(resp.num_devices), &resp.ch, libPath
+			err = nil
+			return int(resp.num_devices), &resp.ch, libPath, err
 		}
 	}
-	return 0, nil, ""
+	return 0, nil, "", err
 }
 
-func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
+// Bootstrap the driver library
+// Returns: num devices, handle, libPath, error
+func loadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string, error) {
 	var resp C.nvcuda_init_resp_t
 	resp.ch.verbose = getVerboseState()
+	var err error
 	for _, libPath := range nvcudaLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.nvcuda_init(lib, &resp)
 		if resp.err != nil {
 			// Decide what log level based on the type of error message to help users understand why
-			msg := C.GoString(resp.err)
 			switch resp.cudaErr {
 			case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
-				slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg)
+				err = fmt.Errorf("version mismatch between driver and cuda driver library - reboot or upgrade may be required: library %s", libPath)
+				slog.Warn(err.Error())
 			case C.CUDA_ERROR_NO_DEVICE:
-				slog.Info("no nvidia devices detected", "library", libPath)
+				err = fmt.Errorf("no nvidia devices detected by library %s", libPath)
+				slog.Info(err.Error())
 			case C.CUDA_ERROR_UNKNOWN:
-				slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg)
-				slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information")
+				err = fmt.Errorf("unknown error initializing cuda driver library %s: %s. see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information", libPath, C.GoString(resp.err))
+				slog.Warn(err.Error())
 			default:
+				msg := C.GoString(resp.err)
 				if strings.Contains(msg, "wrong ELF class") {
 					slog.Debug("skipping 32bit library", "library", libPath)
 				} else {
-					slog.Info("unable to load cuda driver library", "library", libPath, "error", msg)
+					err = fmt.Errorf("unable to load cuda driver library %s: %s", libPath, C.GoString(resp.err))
+					slog.Info(err.Error())
 				}
 			}
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			return int(resp.num_devices), &resp.ch, libPath
+			err = nil
+			return int(resp.num_devices), &resp.ch, libPath, err
 		}
 	}
-	return 0, nil, ""
+	return 0, nil, "", err
 }
 
-func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
+// Bootstrap the management library
+// Returns: handle, libPath, error
+func loadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string, error) {
 	var resp C.nvml_init_resp_t
 	resp.ch.verbose = getVerboseState()
+	var err error
 	for _, libPath := range nvmlLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.nvml_init(lib, &resp)
 		if resp.err != nil {
-			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			err = fmt.Errorf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err))
+			slog.Info(err.Error())
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			return &resp.ch, libPath
+			err = nil
+			return &resp.ch, libPath, err
 		}
 	}
-	return nil, ""
+	return nil, "", err
 }
 
-func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
+// Bootstrap the Intel GPU library
+// Returns: num devices, handle, libPath, error
+func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, error) {
 	var resp C.oneapi_init_resp_t
 	num_devices := 0
 	resp.oh.verbose = getVerboseState()
+	var err error
 	for _, libPath := range oneapiLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.oneapi_init(lib, &resp)
 		if resp.err != nil {
-			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
+			err = fmt.Errorf("Unable to load oneAPI management library %s: %s", libPath, C.GoString(resp.err))
+			slog.Debug(err.Error())
 			C.free(unsafe.Pointer(resp.err))
 		} else {
+			err = nil
 			for i := range resp.oh.num_drivers {
 				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
 			}
-			return num_devices, &resp.oh, libPath
+			return num_devices, &resp.oh, libPath, err
 		}
 	}
-	return 0, nil, ""
+	return 0, nil, "", err
 }
 
 func getVerboseState() C.uint16_t {
@@ -669,3 +724,23 @@ func LibraryDir() string {
 	slog.Warn("unable to locate gpu dependency libraries")
 	return ""
 }
+
+func GetSystemInfo() SystemInfo {
+	gpus := GetGPUInfo()
+	gpuMutex.Lock()
+	defer gpuMutex.Unlock()
+	discoveryErrors := []string{}
+	for _, err := range bootstrapErrors {
+		discoveryErrors = append(discoveryErrors, err.Error())
+	}
+	if len(gpus) == 1 && gpus[0].Library == "cpu" {
+		gpus = []GpuInfo{}
+	}
+
+	return SystemInfo{
+		System:          cpus[0],
+		GPUs:            gpus,
+		UnsupportedGPUs: unsupportedGPUs,
+		DiscoveryErrors: discoveryErrors,
+	}
+}
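
A minimal sketch of how a caller might surface the new data, assuming only the SystemInfo fields shown in the last hunk (System, GPUs, UnsupportedGPUs, DiscoveryErrors); the function name and log keys below are illustrative and not part of this change:

	// Illustrative only: log what GetSystemInfo reports after discovery.
	func logSystemInfo() {
		info := GetSystemInfo()
		slog.Info("system inventory",
			"supported_gpus", len(info.GPUs),
			"unsupported_gpus", len(info.UnsupportedGPUs))
		for _, msg := range info.DiscoveryErrors {
			slog.Warn("gpu discovery issue", "error", msg)
		}
	}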