amd_windows.go 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. package gpu
  2. import (
  3. "bytes"
  4. "fmt"
  5. "log/slog"
  6. "os"
  7. "path/filepath"
  8. "slices"
  9. "strings"
  10. "github.com/ollama/ollama/format"
  11. )
  // iGPUName is the exact device name used to recognize AMD integrated GPUs.
  //
  // TODO We're looking for this exact name to detect iGPUs since
  // hipGetDeviceProperties never reports integrated==true
  const iGPUName = "AMD Radeon(TM) Graphics"
  var (
  	// ROCmLibGlobs lists the file name patterns used to validate if the given
  	// ROCm lib is usable.
  	// TODO - probably include more coverage of files here...
  	ROCmLibGlobs = []string{
  		"hipblas.dll",
  		"rocblas",
  	}

  	// RocmStandardLocations lists the directories searched for a ROCm install;
  	// currently only the ROCm 5.7 bin directory under Program Files.
  	// TODO glob?
  	RocmStandardLocations = []string{
  		`C:\Program Files\AMD\ROCm\5.7\bin`,
  	}
  )
  21. func AMDGetGPUInfo() []GpuInfo {
  22. resp := []GpuInfo{}
  23. hl, err := NewHipLib()
  24. if err != nil {
  25. slog.Debug(err.Error())
  26. return nil
  27. }
  28. defer hl.Release()
  29. // TODO - this reports incorrect version information, so omitting for now
  30. // driverMajor, driverMinor, err := hl.AMDDriverVersion()
  31. // if err != nil {
  32. // // For now this is benign, but we may eventually need to fail compatibility checks
  33. // slog.Debug("error looking up amd driver version", "error", err)
  34. // }
  35. // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
  36. count := hl.HipGetDeviceCount()
  37. if count == 0 {
  38. return nil
  39. }
  40. libDir, err := AMDValidateLibDir()
  41. if err != nil {
  42. slog.Warn("unable to verify rocm library, will use cpu", "error", err)
  43. return nil
  44. }
  45. var supported []string
  46. gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
  47. if gfxOverride == "" {
  48. supported, err = GetSupportedGFX(libDir)
  49. if err != nil {
  50. slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
  51. return nil
  52. }
  53. } else {
  54. slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
  55. }
  56. slog.Debug("detected hip devices", "count", count)
  57. // TODO how to determine the underlying device ID when visible devices is causing this to subset?
  58. for i := range count {
  59. err = hl.HipSetDevice(i)
  60. if err != nil {
  61. slog.Warn("set device", "id", i, "error", err)
  62. continue
  63. }
  64. props, err := hl.HipGetDeviceProperties(i)
  65. if err != nil {
  66. slog.Warn("get properties", "id", i, "error", err)
  67. continue
  68. }
  69. n := bytes.IndexByte(props.Name[:], 0)
  70. name := string(props.Name[:n])
  71. // TODO is UUID actually populated on windows?
  72. // Can luid be used on windows for setting visible devices (and is it actually set?)
  73. n = bytes.IndexByte(props.GcnArchName[:], 0)
  74. gfx := string(props.GcnArchName[:n])
  75. slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
  76. //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
  77. // TODO Why isn't props.iGPU accurate!?
  78. if strings.EqualFold(name, iGPUName) {
  79. slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
  80. continue
  81. }
  82. if gfxOverride == "" {
  83. if !slices.Contains[[]string, string](supported, gfx) {
  84. slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
  85. // TODO - consider discrete markdown just for ROCM troubleshooting?
  86. slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
  87. continue
  88. } else {
  89. slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
  90. }
  91. }
  92. freeMemory, totalMemory, err := hl.HipMemGetInfo()
  93. if err != nil {
  94. slog.Warn("get mem info", "id", i, "error", err)
  95. continue
  96. }
  97. // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
  98. if totalMemory < IGPUMemLimit {
  99. slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
  100. continue
  101. }
  102. // TODO revisit this once ROCm v6 is available on windows.
  103. // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
  104. slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
  105. slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
  106. gpuInfo := GpuInfo{
  107. Library: "rocm",
  108. memInfo: memInfo{
  109. TotalMemory: totalMemory,
  110. FreeMemory: freeMemory,
  111. },
  112. ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
  113. DependencyPath: libDir,
  114. MinimumMemory: rocmMinimumMemory,
  115. Name: name,
  116. Compute: gfx,
  117. // TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
  118. // DriverMajor: driverMajor,
  119. // DriverMinor: driverMinor,
  120. }
  121. resp = append(resp, gpuInfo)
  122. }
  123. return resp
  124. }
  125. func AMDValidateLibDir() (string, error) {
  126. libDir, err := commonAMDValidateLibDir()
  127. if err == nil {
  128. return libDir, nil
  129. }
  130. // Installer payload (if we're running from some other location)
  131. localAppData := os.Getenv("LOCALAPPDATA")
  132. appDir := filepath.Join(localAppData, "Programs", "Ollama")
  133. rocmTargetDir := filepath.Join(appDir, "rocm")
  134. if rocmLibUsable(rocmTargetDir) {
  135. slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
  136. return rocmTargetDir, nil
  137. }
  138. // Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
  139. slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
  140. return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
  141. }