amd_windows.go 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222
  1. package discover
  2. import (
  3. "bytes"
  4. "errors"
  5. "fmt"
  6. "log/slog"
  7. "os"
  8. "path/filepath"
  9. "slices"
  10. "strconv"
  11. "strings"
  12. "github.com/ollama/ollama/envconfig"
  13. "github.com/ollama/ollama/format"
  14. )
  15. const (
  16. // TODO: we're looking for this exact name to detect iGPUs, since hipGetDeviceProperties never reports integrated==true
  17. iGPUName = "AMD Radeon(TM) Graphics"
  18. )
  19. var (
  20. // ROCmLibGlobs holds the names used to validate whether a given ROCm lib is usable
  21. ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // NOTE: this is not sufficient to discern v5 vs v6
  22. RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO: glob? (presumably the default install directories to probe; verify against callers)
  23. )
  24. // Only called once during bootstrap
  25. func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
  26. resp := []RocmGPUInfo{}
  27. hl, err := NewHipLib()
  28. if err != nil {
  29. slog.Debug(err.Error())
  30. return nil, err
  31. }
  32. defer hl.Release()
  33. driverMajor, driverMinor, err := hl.AMDDriverVersion()
  34. if err != nil {
  35. // For now this is benign, but we may eventually need to fail compatibility checks
  36. slog.Debug("error looking up amd driver version", "error", err)
  37. }
  38. // Note: the HIP library automatically handles subsetting to any *_VISIBLE_DEVICES the user specified
  39. count := hl.HipGetDeviceCount()
  40. if count == 0 {
  41. err := fmt.Errorf("no compatible amdgpu devices detected")
  42. slog.Info(err.Error())
  43. return nil, err
  44. }
  45. depPaths := LibraryDirs()
  46. libDir, err := AMDValidateLibDir()
  47. if err != nil {
  48. err = fmt.Errorf("unable to verify rocm library: %w", err)
  49. slog.Warn(err.Error())
  50. return nil, err
  51. }
  52. depPaths = append(depPaths, libDir)
  53. var supported []string
  54. gfxOverride := envconfig.HsaOverrideGfxVersion()
  55. if gfxOverride == "" {
  56. supported, err = GetSupportedGFX(libDir)
  57. if err != nil {
  58. err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
  59. slog.Warn(err.Error())
  60. return nil, err
  61. }
  62. } else {
  63. slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
  64. }
  65. slog.Debug("detected hip devices", "count", count)
  66. // TODO how to determine the underlying device ID when visible devices is causing this to subset?
  67. for i := range count {
  68. err = hl.HipSetDevice(i)
  69. if err != nil {
  70. slog.Warn("set device", "id", i, "error", err)
  71. continue
  72. }
  73. props, err := hl.HipGetDeviceProperties(i)
  74. if err != nil {
  75. slog.Warn("get properties", "id", i, "error", err)
  76. continue
  77. }
  78. n := bytes.IndexByte(props.Name[:], 0)
  79. name := string(props.Name[:n])
  80. // TODO is UUID actually populated on windows?
  81. // Can luid be used on windows for setting visible devices (and is it actually set?)
  82. n = bytes.IndexByte(props.GcnArchName[:], 0)
  83. gfx := string(props.GcnArchName[:n])
  84. slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
  85. // slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
  86. // TODO Why isn't props.iGPU accurate!?
  87. freeMemory, totalMemory, err := hl.HipMemGetInfo()
  88. if err != nil {
  89. slog.Warn("get mem info", "id", i, "error", err)
  90. continue
  91. }
  92. gpuInfo := RocmGPUInfo{
  93. GpuInfo: GpuInfo{
  94. Library: "rocm",
  95. memInfo: memInfo{
  96. TotalMemory: totalMemory,
  97. FreeMemory: freeMemory,
  98. },
  99. // Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
  100. UnreliableFreeMemory: true,
  101. ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
  102. DependencyPath: depPaths,
  103. MinimumMemory: rocmMinimumMemory,
  104. Name: name,
  105. Compute: gfx,
  106. DriverMajor: driverMajor,
  107. DriverMinor: driverMinor,
  108. },
  109. index: i,
  110. }
  111. // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
  112. if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
  113. reason := "unsupported Radeon iGPU detected skipping"
  114. slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
  115. unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
  116. GpuInfo: gpuInfo.GpuInfo,
  117. Reason: reason,
  118. })
  119. continue
  120. }
  121. // Strip off Target Features when comparing
  122. if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
  123. reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
  124. slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
  125. unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
  126. GpuInfo: gpuInfo.GpuInfo,
  127. Reason: reason,
  128. })
  129. // HSA_OVERRIDE_GFX_VERSION not supported on windows
  130. continue
  131. } else {
  132. slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
  133. }
  134. slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
  135. slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
  136. resp = append(resp, gpuInfo)
  137. }
  138. return resp, nil
  139. }
  140. func AMDValidateLibDir() (string, error) {
  141. libDir, err := commonAMDValidateLibDir()
  142. if err == nil {
  143. return libDir, nil
  144. }
  145. // Installer payload (if we're running from some other location)
  146. localAppData := os.Getenv("LOCALAPPDATA")
  147. appDir := filepath.Join(localAppData, "Programs", "Ollama")
  148. rocmTargetDir := filepath.Join(appDir, envconfig.LibRelativeToExe(), "lib", "ollama")
  149. if rocmLibUsable(rocmTargetDir) {
  150. slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
  151. return rocmTargetDir, nil
  152. }
  153. // Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
  154. slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
  155. return "", errors.New("no suitable rocm found, falling back to CPU")
  156. }
  157. func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
  158. if len(gpus) == 0 {
  159. return nil
  160. }
  161. hl, err := NewHipLib()
  162. if err != nil {
  163. slog.Debug(err.Error())
  164. return err
  165. }
  166. defer hl.Release()
  167. for i := range gpus {
  168. err := hl.HipSetDevice(gpus[i].index)
  169. if err != nil {
  170. return err
  171. }
  172. freeMemory, _, err := hl.HipMemGetInfo()
  173. if err != nil {
  174. slog.Warn("get mem info", "id", i, "error", err)
  175. continue
  176. }
  177. slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
  178. gpus[i].FreeMemory = freeMemory
  179. }
  180. return nil
  181. }
  182. func rocmGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
  183. ids := []string{}
  184. for _, info := range gpuInfo {
  185. if info.Library != "rocm" {
  186. // TODO shouldn't happen if things are wired correctly...
  187. slog.Debug("rocmGetVisibleDevicesEnv skipping over non-rocm device", "library", info.Library)
  188. continue
  189. }
  190. ids = append(ids, info.ID)
  191. }
  192. // There are 3 potential env vars to use to select GPUs.
  193. // ROCR_VISIBLE_DEVICES supports UUID or numeric but does not work on Windows
  194. // HIP_VISIBLE_DEVICES supports numeric IDs only
  195. // GPU_DEVICE_ORDINAL supports numeric IDs only
  196. return "HIP_VISIBLE_DEVICES", strings.Join(ids, ",")
  197. }