amd_windows.go 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. package discover
  2. import (
  3. "bytes"
  4. "errors"
  5. "fmt"
  6. "log/slog"
  7. "path/filepath"
  8. "slices"
  9. "strconv"
  10. "strings"
  11. "github.com/ollama/ollama/envconfig"
  12. "github.com/ollama/ollama/format"
  13. )
  14. const (
  15. // TODO We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
  16. iGPUName = "AMD Radeon(TM) Graphics"
  17. )
  18. var (
  19. // Used to validate if the given ROCm lib is usable
  20. ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // This is not sufficient to discern v5 vs v6
  21. RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob?
  22. )
  23. // Only called once during bootstrap
  24. func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
  25. resp := []RocmGPUInfo{}
  26. hl, err := NewHipLib()
  27. if err != nil {
  28. slog.Debug(err.Error())
  29. return nil, err
  30. }
  31. defer hl.Release()
  32. driverMajor, driverMinor, err := hl.AMDDriverVersion()
  33. if err != nil {
  34. // For now this is benign, but we may eventually need to fail compatibility checks
  35. slog.Debug("error looking up amd driver version", "error", err)
  36. }
  37. // Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
  38. count := hl.HipGetDeviceCount()
  39. if count == 0 {
  40. err := fmt.Errorf("no compatible amdgpu devices detected")
  41. slog.Info(err.Error())
  42. return nil, err
  43. }
  44. libDir, err := AMDValidateLibDir()
  45. if err != nil {
  46. err = fmt.Errorf("unable to verify rocm library: %w", err)
  47. slog.Warn(err.Error())
  48. return nil, err
  49. }
  50. var supported []string
  51. gfxOverride := envconfig.HsaOverrideGfxVersion()
  52. if gfxOverride == "" {
  53. supported, err = GetSupportedGFX(libDir)
  54. if err != nil {
  55. err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
  56. slog.Warn(err.Error())
  57. return nil, err
  58. }
  59. } else {
  60. slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
  61. }
  62. slog.Debug("detected hip devices", "count", count)
  63. // TODO how to determine the underlying device ID when visible devices is causing this to subset?
  64. for i := range count {
  65. err = hl.HipSetDevice(i)
  66. if err != nil {
  67. slog.Warn("set device", "id", i, "error", err)
  68. continue
  69. }
  70. props, err := hl.HipGetDeviceProperties(i)
  71. if err != nil {
  72. slog.Warn("get properties", "id", i, "error", err)
  73. continue
  74. }
  75. n := bytes.IndexByte(props.Name[:], 0)
  76. name := string(props.Name[:n])
  77. // TODO is UUID actually populated on windows?
  78. // Can luid be used on windows for setting visible devices (and is it actually set?)
  79. n = bytes.IndexByte(props.GcnArchName[:], 0)
  80. gfx := string(props.GcnArchName[:n])
  81. slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
  82. // slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
  83. // TODO Why isn't props.iGPU accurate!?
  84. freeMemory, totalMemory, err := hl.HipMemGetInfo()
  85. if err != nil {
  86. slog.Warn("get mem info", "id", i, "error", err)
  87. continue
  88. }
  89. gpuInfo := RocmGPUInfo{
  90. GpuInfo: GpuInfo{
  91. Library: "rocm",
  92. memInfo: memInfo{
  93. TotalMemory: totalMemory,
  94. FreeMemory: freeMemory,
  95. },
  96. // Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
  97. UnreliableFreeMemory: true,
  98. ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
  99. DependencyPath: []string{libDir},
  100. MinimumMemory: rocmMinimumMemory,
  101. Name: name,
  102. Compute: gfx,
  103. DriverMajor: driverMajor,
  104. DriverMinor: driverMinor,
  105. },
  106. index: i,
  107. }
  108. // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
  109. if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
  110. reason := "unsupported Radeon iGPU detected skipping"
  111. slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
  112. unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
  113. GpuInfo: gpuInfo.GpuInfo,
  114. Reason: reason,
  115. })
  116. continue
  117. }
  118. // Strip off Target Features when comparing
  119. if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
  120. reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
  121. slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
  122. unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
  123. GpuInfo: gpuInfo.GpuInfo,
  124. Reason: reason,
  125. })
  126. // HSA_OVERRIDE_GFX_VERSION not supported on windows
  127. continue
  128. } else {
  129. slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
  130. }
  131. slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
  132. slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
  133. resp = append(resp, gpuInfo)
  134. }
  135. return resp, nil
  136. }
  137. func AMDValidateLibDir() (string, error) {
  138. libDir, err := commonAMDValidateLibDir()
  139. if err == nil {
  140. return libDir, nil
  141. }
  142. // Installer payload (if we're running from some other location)
  143. rocmTargetDir := filepath.Join(LibOllamaPath, "rocm")
  144. if rocmLibUsable(rocmTargetDir) {
  145. slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
  146. return rocmTargetDir, nil
  147. }
  148. // Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
  149. slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
  150. return "", errors.New("no suitable rocm found, falling back to CPU")
  151. }
  152. func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
  153. if len(gpus) == 0 {
  154. return nil
  155. }
  156. hl, err := NewHipLib()
  157. if err != nil {
  158. slog.Debug(err.Error())
  159. return err
  160. }
  161. defer hl.Release()
  162. for i := range gpus {
  163. err := hl.HipSetDevice(gpus[i].index)
  164. if err != nil {
  165. return err
  166. }
  167. freeMemory, _, err := hl.HipMemGetInfo()
  168. if err != nil {
  169. slog.Warn("get mem info", "id", i, "error", err)
  170. continue
  171. }
  172. slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
  173. gpus[i].FreeMemory = freeMemory
  174. }
  175. return nil
  176. }
  177. func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
  178. ids := []string{}
  179. for _, info := range gpuInfo {
  180. if info.Library != "rocm" {
  181. // TODO shouldn't happen if things are wired correctly...
  182. slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
  183. continue
  184. }
  185. ids = append(ids, info.ID)
  186. }
  187. // There are 3 potential env vars to use to select GPUs.
  188. // ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
  189. // HIP_VISIBLE_DEVICES supports numeric IDs only
  190. // GPU_DEVICE_ORDINAL supports numeric IDs only
  191. return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
  192. }