amd_windows.go 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. package gpu
  2. import (
  3. "bytes"
  4. "fmt"
  5. "log/slog"
  6. "os"
  7. "path/filepath"
  8. "slices"
  9. "strings"
  10. )
  11. const (
  12. RocmStandardLocation = "C:\\Program Files\\AMD\\ROCm\\5.7\\bin" // TODO glob?
  13. // TODO We're lookinng for this exact name to detect iGPUs since hipGetDeviceProperties never reports integrated==true
  14. iGPUName = "AMD Radeon(TM) Graphics"
  15. )
  16. var (
  17. // Used to validate if the given ROCm lib is usable
  18. ROCmLibGlobs = []string{"hipblas.dll", "rocblas"} // TODO - probably include more coverage of files here...
  19. )
  20. func AMDGetGPUInfo(resp *GpuInfo) {
  21. hl, err := NewHipLib()
  22. if err != nil {
  23. slog.Debug(err.Error())
  24. return
  25. }
  26. defer hl.Release()
  27. skip := map[int]interface{}{}
  28. ids := []int{}
  29. resp.memInfo.DeviceCount = 0
  30. resp.memInfo.TotalMemory = 0
  31. resp.memInfo.FreeMemory = 0
  32. ver, err := hl.AMDDriverVersion()
  33. if err == nil {
  34. slog.Info("AMD Driver: " + ver)
  35. } else {
  36. // For now this is benign, but we may eventually need to fail compatibility checks
  37. slog.Debug(fmt.Sprintf("error looking up amd driver version: %s", err))
  38. }
  39. // Note: the HIP library automatically handles HIP_VISIBLE_DEVICES
  40. count := hl.HipGetDeviceCount()
  41. if count == 0 {
  42. return
  43. }
  44. libDir, err := AMDValidateLibDir()
  45. if err != nil {
  46. slog.Warn(fmt.Sprintf("unable to verify rocm library, will use cpu: %s", err))
  47. return
  48. }
  49. var supported []string
  50. gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
  51. if gfxOverride == "" {
  52. supported, err = GetSupportedGFX(libDir)
  53. if err != nil {
  54. slog.Warn(fmt.Sprintf("failed to lookup supported GFX types, falling back to CPU mode: %s", err))
  55. return
  56. }
  57. } else {
  58. slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
  59. }
  60. slog.Info(fmt.Sprintf("detected %d hip devices", count))
  61. for i := 0; i < count; i++ {
  62. ids = append(ids, i)
  63. err = hl.HipSetDevice(i)
  64. if err != nil {
  65. slog.Warn(fmt.Sprintf("[%d] %s", i, err))
  66. skip[i] = struct{}{}
  67. continue
  68. }
  69. props, err := hl.HipGetDeviceProperties(i)
  70. if err != nil {
  71. slog.Warn(fmt.Sprintf("[%d] %s", i, err))
  72. skip[i] = struct{}{}
  73. continue
  74. }
  75. n := bytes.IndexByte(props.Name[:], 0)
  76. name := string(props.Name[:n])
  77. slog.Info(fmt.Sprintf("[%d] Name: %s", i, name))
  78. n = bytes.IndexByte(props.GcnArchName[:], 0)
  79. gfx := string(props.GcnArchName[:n])
  80. slog.Info(fmt.Sprintf("[%d] GcnArchName: %s", i, gfx))
  81. //slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
  82. // TODO Why isn't props.iGPU accurate!?
  83. if strings.EqualFold(name, iGPUName) {
  84. slog.Info(fmt.Sprintf("iGPU detected [%d] skipping", i))
  85. skip[i] = struct{}{}
  86. continue
  87. }
  88. if gfxOverride == "" {
  89. if !slices.Contains[[]string, string](supported, gfx) {
  90. slog.Warn(fmt.Sprintf("amdgpu [%d] %s is not supported by %s %v", i, gfx, libDir, supported))
  91. // TODO - consider discrete markdown just for ROCM troubleshooting?
  92. slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
  93. skip[i] = struct{}{}
  94. continue
  95. } else {
  96. slog.Info(fmt.Sprintf("amdgpu [%d] %s is supported", i, gfx))
  97. }
  98. }
  99. totalMemory, freeMemory, err := hl.HipMemGetInfo()
  100. if err != nil {
  101. slog.Warn(fmt.Sprintf("[%d] %s", i, err))
  102. continue
  103. }
  104. // TODO according to docs, freeMem may lie on windows!
  105. slog.Info(fmt.Sprintf("[%d] Total Mem: %d", i, totalMemory))
  106. slog.Info(fmt.Sprintf("[%d] Free Mem: %d", i, freeMemory))
  107. resp.memInfo.DeviceCount++
  108. resp.memInfo.TotalMemory += totalMemory
  109. resp.memInfo.FreeMemory += freeMemory
  110. }
  111. if resp.memInfo.DeviceCount > 0 {
  112. resp.Library = "rocm"
  113. }
  114. // Abort if all GPUs are skipped
  115. if len(skip) >= count {
  116. slog.Info("all detected amdgpus are skipped, falling back to CPU")
  117. return
  118. }
  119. if len(skip) > 0 {
  120. amdSetVisibleDevices(ids, skip)
  121. }
  122. UpdatePath(libDir)
  123. }
  124. func AMDValidateLibDir() (string, error) {
  125. // On windows non-admins typically can't create links
  126. // so instead of trying to rely on rpath and a link in
  127. // $LibDir/rocm, we instead rely on setting PATH to point
  128. // to the location of the ROCm library
  129. // Installer payload location
  130. exe, err := os.Executable()
  131. if err == nil {
  132. rocmTargetDir := filepath.Join(filepath.Dir(exe), "rocm")
  133. if rocmLibUsable(rocmTargetDir) {
  134. slog.Debug("detected ROCM next to ollama executable " + rocmTargetDir)
  135. return rocmTargetDir, nil
  136. }
  137. }
  138. // If we already have a rocm dependency wired, nothing more to do
  139. libDir, err := AssetsDir()
  140. if err != nil {
  141. return "", fmt.Errorf("unable to lookup lib dir: %w", err)
  142. }
  143. rocmTargetDir := filepath.Join(libDir, "rocm")
  144. if rocmLibUsable(rocmTargetDir) {
  145. return rocmTargetDir, nil
  146. }
  147. // Prefer explicit HIP env var
  148. hipPath := os.Getenv("HIP_PATH")
  149. if hipPath != "" {
  150. hipLibDir := filepath.Join(hipPath, "bin")
  151. if rocmLibUsable(hipLibDir) {
  152. slog.Debug("detected ROCM via HIP_PATH=" + hipPath)
  153. return hipLibDir, nil
  154. }
  155. }
  156. // Well known location(s)
  157. if rocmLibUsable(RocmStandardLocation) {
  158. return RocmStandardLocation, nil
  159. }
  160. // Installer payload (if we're running from some other location)
  161. localAppData := os.Getenv("LOCALAPPDATA")
  162. appDir := filepath.Join(localAppData, "Programs", "Ollama")
  163. rocmTargetDir = filepath.Join(appDir, "rocm")
  164. if rocmLibUsable(rocmTargetDir) {
  165. slog.Debug("detected ollama installed ROCm at " + rocmTargetDir)
  166. return rocmTargetDir, nil
  167. }
  168. // Should not happen on windows since we include it in the installer, but stand-alone binary might hit this
  169. slog.Warn("amdgpu detected, but no compatible rocm library found. Please install ROCm v6")
  170. return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
  171. }