amd_windows.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. package gpu
  2. import (
  3. "bytes"
  4. "fmt"
  5. "log/slog"
  6. "os"
  7. "path/filepath"
  8. "slices"
  9. "strconv"
  10. "strings"
  11. "github.com/ollama/ollama/format"
  12. )
  13. const (
  14. // TODO We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
  15. iGPUName = "AMD Radeon(TM) Graphics"
  16. )
  17. var (
  18. // Used to validate if the given ROCm lib is usable
  19. ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
  20. RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
  21. )
  22. func AMDGetGPUInfo() []GpuInfo {
  23. resp := []GpuInfo{}
  24. hl, err := NewHipLib()
  25. if err != nil {
  26. slog.Debug(err.Error())
  27. return nil
  28. }
  29. defer hl.Release()
  30. ver, err := hl.AMDDriverVersion()
  31. if err == nil {
  32. slog.Info("AMD Driver: " + ver)
  33. } else {
  34. // For now this is benign, but we may eventually need to fail compatibility checks
  35. slog.Debug("error looking up amd driver version", "error", err)
  36. }
  37. // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
  38. count := hl.HipGetDeviceCount()
  39. if count == 0 {
  40. return nil
  41. }
  42. libDir, err := AMDValidateLibDir()
  43. if err != nil {
  44. slog.Warn("unable to verify rocm library, will use cpu", "error", err)
  45. return nil
  46. }
  47. var supported []string
  48. gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
  49. if gfxOverride == "" {
  50. supported, err = GetSupportedGFX(libDir)
  51. if err != nil {
  52. slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
  53. return nil
  54. }
  55. } else {
  56. slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
  57. }
  58. slog.Info("detected hip devices", "count", count)
  59. // TODO how to determine the underlying device ID when visible devices is causing this to subset?
  60. for i := 0; i < count; i++ {
  61. err = hl.HipSetDevice(i)
  62. if err != nil {
  63. slog.Warn("set device", "id", i, "error", err)
  64. continue
  65. }
  66. props, err := hl.HipGetDeviceProperties(i)
  67. if err != nil {
  68. slog.Warn("get properties", "id", i, "error", err)
  69. continue
  70. }
  71. n := bytes.IndexByte(props.Name[:], 0)
  72. name := string(props.Name[:n])
  73. // TODO is UUID actually populated on windows?
  74. // Can luid be used on windows for setting visible devices (and is it actually set?)
  75. n = bytes.IndexByte(props.GcnArchName[:], 0)
  76. gfx := string(props.GcnArchName[:n])
  77. slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
  78. var major, minor, patch string
  79. switch len(gfx) {
  80. case 6:
  81. major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
  82. case 7:
  83. major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
  84. }
  85. //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
  86. // TODO Why isn't props.iGPU accurate!?
  87. if strings.EqualFold(name, iGPUName) {
  88. slog.Info("iGPU detected skipping", "id", i)
  89. continue
  90. }
  91. if gfxOverride == "" {
  92. if !slices.Contains[[]string, string](supported, gfx) {
  93. slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
  94. // TODO - consider discrete markdown just for ROCM troubleshooting?
  95. slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
  96. continue
  97. } else {
  98. slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
  99. }
  100. }
  101. freeMemory, totalMemory, err := hl.HipMemGetInfo()
  102. if err != nil {
  103. slog.Warn("get mem info", "id", i, "error", err)
  104. continue
  105. }
  106. // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
  107. if totalMemory < IGPUMemLimit {
  108. slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
  109. continue
  110. }
  111. // TODO revisit this once ROCm v6 is available on windows.
  112. // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
  113. slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
  114. slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
  115. gpuInfo := GpuInfo{
  116. Library: "rocm",
  117. memInfo: memInfo{
  118. TotalMemory: totalMemory,
  119. FreeMemory: freeMemory,
  120. },
  121. ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
  122. DependencyPath: libDir,
  123. MinimumMemory: rocmMinimumMemory,
  124. }
  125. if major != "" {
  126. gpuInfo.Major, err = strconv.Atoi(major)
  127. if err != nil {
  128. slog.Info("failed to parse version", "version", gfx, "error", err)
  129. }
  130. }
  131. if minor != "" {
  132. gpuInfo.Minor, err = strconv.Atoi(minor)
  133. if err != nil {
  134. slog.Info("failed to parse version", "version", gfx, "error", err)
  135. }
  136. }
  137. if patch != "" {
  138. // Patch rev is hex; e.g. gfx90a
  139. p, err := strconv.ParseInt(patch, 16, 0)
  140. if err != nil {
  141. slog.Info("failed to parse version", "version", gfx, "error", err)
  142. } else {
  143. gpuInfo.Patch = int(p)
  144. }
  145. }
  146. if gpuInfo.Major < RocmComputeMin {
  147. slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
  148. continue
  149. }
  150. resp = append(resp, gpuInfo)
  151. }
  152. return resp
  153. }
  154. func AMDValidateLibDir() (string, error) {
  155. libDir, err := commonAMDValidateLibDir()
  156. if err == nil {
  157. return libDir, nil
  158. }
  159. // Installer payload (if we're running from some other location)
  160. localAppData := os.Getenv("LOCALAPPDATA")
  161. appDir := filepath.Join(localAppData, "Programs", "Ollama")
  162. rocmTargetDir := filepath.Join(appDir, "rocm")
  163. if rocmLibUsable(rocmTargetDir) {
  164. slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
  165. return rocmTargetDir, nil
  166. }
  167. // Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
  168. slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
  169. return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
  170. }