amd_windows.go 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193
  1. package gpu
  2. import (
  3. "bytes"
  4. "fmt"
  5. "log/slog"
  6. "os"
  7. "path/filepath"
  8. "slices"
  9. "strconv"
  10. "strings"
  11. "github.com/ollama/ollama/envconfig"
  12. "github.com/ollama/ollama/format"
  13. )
  14. const (
  15. // TODO We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
  16. iGPUName = "AMD Radeon(TM) Graphics"
  17. )
  18. var (
  19. // Used to validate if the given ROCm lib is usable
  20. ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
  21. RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\5.7\\bin"} // TODO glob?
  22. )
  23. func AMDGetGPUInfo() []RocmGPUInfo {
  24. resp := []RocmGPUInfo{}
  25. hl, err := NewHipLib()
  26. if err != nil {
  27. slog.Debug(err.Error())
  28. return nil
  29. }
  30. defer hl.Release()
  31. // TODO - this reports incorrect version information, so omitting for now
  32. // driverMajor, driverMinor, err := hl.AMDDriverVersion()
  33. // if err != nil {
  34. // // For now this is benign, but we may eventually need to fail compatibility checks
  35. // slog.Debug("error looking up amd driver version", "error", err)
  36. // }
  37. // Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
  38. count := hl.HipGetDeviceCount()
  39. if count == 0 {
  40. return nil
  41. }
  42. libDir, err := AMDValidateLibDir()
  43. if err != nil {
  44. slog.Warn("unable to verify rocm library, will use cpu", "error", err)
  45. return nil
  46. }
  47. var supported []string
  48. gfxOverride := envconfig.HsaOverrideGfxVersion
  49. if gfxOverride == "" {
  50. supported, err = GetSupportedGFX(libDir)
  51. if err != nil {
  52. slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
  53. return nil
  54. }
  55. } else {
  56. slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
  57. }
  58. slog.Debug("detected hip devices", "count", count)
  59. // TODO how to determine the underlying device ID when visible devices is causing this to subset?
  60. for i := range count {
  61. err = hl.HipSetDevice(i)
  62. if err != nil {
  63. slog.Warn("set device", "id", i, "error", err)
  64. continue
  65. }
  66. props, err := hl.HipGetDeviceProperties(i)
  67. if err != nil {
  68. slog.Warn("get properties", "id", i, "error", err)
  69. continue
  70. }
  71. n := bytes.IndexByte(props.Name[:], 0)
  72. name := string(props.Name[:n])
  73. // TODO is UUID actually populated on windows?
  74. // Can luid be used on windows for setting visible devices (and is it actually set?)
  75. n = bytes.IndexByte(props.GcnArchName[:], 0)
  76. gfx := string(props.GcnArchName[:n])
  77. slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
  78. //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
  79. // TODO Why isn't props.iGPU accurate!?
  80. if strings.EqualFold(name, iGPUName) {
  81. slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
  82. continue
  83. }
  84. if gfxOverride == "" {
  85. if !slices.Contains[[]string, string](supported, gfx) {
  86. slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
  87. // TODO - consider discrete markdown just for ROCM troubleshooting?
  88. slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
  89. continue
  90. } else {
  91. slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
  92. }
  93. }
  94. freeMemory, totalMemory, err := hl.HipMemGetInfo()
  95. if err != nil {
  96. slog.Warn("get mem info", "id", i, "error", err)
  97. continue
  98. }
  99. // iGPU detection, remove this check once we can support an iGPU variant of the rocm library
  100. if totalMemory < IGPUMemLimit {
  101. slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
  102. continue
  103. }
  104. // TODO revisit this once ROCm v6 is available on windows.
  105. // v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
  106. slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
  107. slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
  108. gpuInfo := RocmGPUInfo{
  109. GpuInfo: GpuInfo{
  110. Library: "rocm",
  111. memInfo: memInfo{
  112. TotalMemory: totalMemory,
  113. FreeMemory: freeMemory,
  114. },
  115. ID: strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
  116. DependencyPath: libDir,
  117. MinimumMemory: rocmMinimumMemory,
  118. Name: name,
  119. Compute: gfx,
  120. // TODO - this information isn't accurate on windows, so don't report it until we find the right way to retrieve
  121. // DriverMajor: driverMajor,
  122. // DriverMinor: driverMinor,
  123. },
  124. index: i,
  125. }
  126. resp = append(resp, gpuInfo)
  127. }
  128. return resp
  129. }
  130. func AMDValidateLibDir() (string, error) {
  131. libDir, err := commonAMDValidateLibDir()
  132. if err == nil {
  133. return libDir, nil
  134. }
  135. // Installer payload (if we're running from some other location)
  136. localAppData := os.Getenv("LOCALAPPDATA")
  137. appDir := filepath.Join(localAppData, "Programs", "Ollama")
  138. rocmTargetDir := filepath.Join(appDir, "rocm")
  139. if rocmLibUsable(rocmTargetDir) {
  140. slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
  141. return rocmTargetDir, nil
  142. }
  143. // Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
  144. slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm")
  145. return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
  146. }
  147. func (gpus RocmGPUInfoList) RefreshFreeMemory() error {
  148. if len(gpus) == 0 {
  149. return nil
  150. }
  151. hl, err := NewHipLib()
  152. if err != nil {
  153. slog.Debug(err.Error())
  154. return nil
  155. }
  156. defer hl.Release()
  157. for i := range gpus {
  158. err := hl.HipSetDevice(gpus[i].index)
  159. if err != nil {
  160. return err
  161. }
  162. freeMemory, _, err := hl.HipMemGetInfo()
  163. if err != nil {
  164. slog.Warn("get mem info", "id", i, "error", err)
  165. continue
  166. }
  167. slog.Debug("updating rocm free memory", "gpu", gpus[i].ID, "name", gpus[i].Name, "before", format.HumanBytes2(gpus[i].FreeMemory), "now", format.HumanBytes2(freeMemory))
  168. gpus[i].FreeMemory = freeMemory
  169. }
  170. return nil
  171. }