소스 검색

Merge pull request #2465 from dhiltgen/block_rocm_pre_9

Detect AMD GPU info via sysfs and block old cards
Daniel Hiltgen 1 년 전
부모
커밋
76b8728f0c
5개의 변경된 파일, 151개의 추가 및 34개의 삭제
  1. 91 0
      gpu/amd.go
  2. 53 33
      gpu/gpu.go
  3. 6 0
      gpu/types.go
  4. 0 1
      llm/generate/gen_linux.sh
  5. 1 0
      llm/payload_common.go

+ 91 - 0
gpu/amd.go

@@ -0,0 +1,91 @@
+package gpu
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// TODO - windows vs. non-windows vs darwin
+
+// Discovery logic for AMD/ROCm GPUs
+
+// sysfs locations consulted by the AMD/ROCm discovery logic.
+const (
+	// DriverVersionFile is the file whose contents are returned as the amdgpu
+	// driver version string.
+	DriverVersionFile = "/sys/module/amdgpu/version"
+
+	// GPUPropertiesFileGlob matches the per-node properties files that are
+	// scanned for a gfx_target_version line.
+	GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"
+
+	// TODO probably break these down per GPU to make the logic simpler
+
+	// GPUTotalMemoryFileGlob matches the per-memory-bank properties files; the
+	// line of interest in them is size_in_bytes.
+	GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties"
+
+	// GPUUsedMemoryFileGlob matches the per-memory-bank used_memory files.
+	GPUUsedMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
+)
+
+// AMDDetected reports whether the amdgpu driver version could be read, i.e.
+// whether AMDDriverVersion completes without an error.
+func AMDDetected() bool {
+	if _, err := AMDDriverVersion(); err != nil {
+		return false
+	}
+	return true
+}
+
+// AMDDriverVersion returns the contents of DriverVersionFile with leading and
+// trailing whitespace removed. If the file cannot be opened or read, the error
+// is returned and the string is empty.
+func AMDDriverVersion() (string, error) {
+	// Open directly rather than calling os.Stat first: Open already reports a
+	// missing or unreadable file, so a separate existence check only adds a
+	// syscall and a window in which the file can change between the two calls.
+	fp, err := os.Open(DriverVersionFile)
+	if err != nil {
+		return "", err
+	}
+	defer fp.Close()
+
+	verString, err := io.ReadAll(fp)
+	if err != nil {
+		return "", err
+	}
+	return strings.TrimSpace(string(verString)), nil
+}
+
+// AMDGFXVersions collects the version parsed from every gfx_target_version
+// line found in the files matching GPUPropertiesFileGlob. Files that cannot be
+// opened and lines that cannot be parsed are logged at debug level and
+// skipped. The result is never nil; it is empty when nothing was found.
+func AMDGFXVersions() []Version {
+	res := []Version{}
+	matches, err := filepath.Glob(GPUPropertiesFileGlob)
+	if err != nil {
+		// Glob only fails on a malformed pattern; log it instead of silently
+		// reporting that no GPUs exist.
+		slog.Debug(fmt.Sprintf("failed to glob sysfs node files %s: %s", GPUPropertiesFileGlob, err))
+		return res
+	}
+	for _, match := range matches {
+		res = append(res, gfxVersionsFromFile(match)...)
+	}
+	return res
+}
+
+// gfxVersionsFromFile returns the versions parsed from the gfx_target_version
+// lines of a single properties file. Keeping this in its own function lets the
+// deferred Close run per file rather than piling up until every file has been
+// processed.
+func gfxVersionsFromFile(path string) []Version {
+	fp, err := os.Open(path)
+	if err != nil {
+		slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", path, err))
+		return nil
+	}
+	defer fp.Close()
+
+	var res []Version
+	scanner := bufio.NewScanner(fp)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if !strings.HasPrefix(line, "gfx_target_version") {
+			continue
+		}
+		if v, ok := parseGFXTargetVersion(line); ok {
+			res = append(res, v)
+		}
+	}
+	// Scan returns false on read errors as well as on EOF, so check which one
+	// ended the loop.
+	if err := scanner.Err(); err != nil {
+		slog.Debug(fmt.Sprintf("failed to read sysfs node file %s: %s", path, err))
+	}
+	return res
+}
+
+// parseGFXTargetVersion decodes a "gfx_target_version <digits>" line. The last
+// two digits are the patch, the two before them the minor, and whatever
+// precedes those the major. It reports false, after logging at debug level,
+// when the line does not have exactly two fields, the value is shorter than
+// five characters, or any part is not a base-10 unsigned integer.
+func parseGFXTargetVersion(line string) (Version, bool) {
+	ver := strings.Fields(line)
+	if len(ver) != 2 || len(ver[1]) < 5 {
+		slog.Debug("malformed " + line)
+		return Version{}, false
+	}
+	digits := ver[1]
+	l := len(digits)
+	patch, err1 := strconv.ParseUint(digits[l-2:l], 10, 32)
+	minor, err2 := strconv.ParseUint(digits[l-4:l-2], 10, 32)
+	major, err3 := strconv.ParseUint(digits[:l-4], 10, 32)
+	if err1 != nil || err2 != nil || err3 != nil {
+		slog.Debug("malformed int " + line)
+		return Version{}, false
+	}
+	return Version{
+		Major: uint(major),
+		Minor: uint(minor),
+		Patch: uint(patch),
+	}, true
+}
+
+// ToGFXString renders the version as an AMD GPU target name: the literal
+// "gfx", then Major in decimal, then Minor and Patch in lowercase hexadecimal,
+// e.g. {9, 0, 6} -> "gfx906" and {9, 0, 10} -> "gfx90a".
+func (v Version) ToGFXString() string {
+	// Target names encode values of 10 and above as hex digits, so formatting
+	// Patch 10 in decimal would yield "gfx9010" instead of "gfx90a". Values
+	// below 10 render the same in either base.
+	return fmt.Sprintf("gfx%d%x%x", v.Major, v.Minor, v.Patch)
+}

+ 53 - 33
gpu/gpu.go

@@ -149,43 +149,63 @@ func GetGPUInfo() GpuInfo {
 				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
 		}
-	} else if gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
-		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
-		if memInfo.err != nil {
-			slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
-			C.free(unsafe.Pointer(memInfo.err))
-		} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
-			// Only one GPU detected and it appears to be an integrated GPU - skip it
-			slog.Info("ROCm unsupported integrated GPU detected")
-		} else if memInfo.count > 0 {
-			if memInfo.igpu_index >= 0 {
-				// We have multiple GPUs reported, and one of them is an integrated GPU
-				// so we have to set the env var to bypass it
-				// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
-				val := os.Getenv("ROCR_VISIBLE_DEVICES")
-				if val == "" {
-					devices := []string{}
-					for i := 0; i < int(memInfo.count); i++ {
-						if i == int(memInfo.igpu_index) {
-							continue
+	} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		ver, err := AMDDriverVersion()
+		if err == nil {
+			slog.Info("AMD Driver: " + ver)
+		}
+		gfx := AMDGFXVersions()
+		tooOld := false
+		for _, v := range gfx {
+			if v.Major < 9 {
+				slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
+				tooOld = true
+				break
+			}
+
+			// TODO - remap gfx strings for unsupported minor/patch versions to supported for the same major
+			// e.g. gfx1034 works if we map it to gfx1030 at runtime
+
+		}
+		if !tooOld {
+			// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
+			C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
+			if memInfo.err != nil {
+				slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
+				C.free(unsafe.Pointer(memInfo.err))
+			} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
+				// Only one GPU detected and it appears to be an integrated GPU - skip it
+				slog.Info("ROCm unsupported integrated GPU detected")
+			} else if memInfo.count > 0 {
+				if memInfo.igpu_index >= 0 {
+					// We have multiple GPUs reported, and one of them is an integrated GPU
+					// so we have to set the env var to bypass it
+					// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
+					val := os.Getenv("ROCR_VISIBLE_DEVICES")
+					if val == "" {
+						devices := []string{}
+						for i := 0; i < int(memInfo.count); i++ {
+							if i == int(memInfo.igpu_index) {
+								continue
+							}
+							devices = append(devices, strconv.Itoa(i))
 						}
-						devices = append(devices, strconv.Itoa(i))
+						val = strings.Join(devices, ",")
+						os.Setenv("ROCR_VISIBLE_DEVICES", val)
 					}
-					val = strings.Join(devices, ",")
-					os.Setenv("ROCR_VISIBLE_DEVICES", val)
+					slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
 				}
-				slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
-			}
-			resp.Library = "rocm"
-			var version C.rocm_version_resp_t
-			C.rocm_get_version(*gpuHandles.rocm, &version)
-			verString := C.GoString(version.str)
-			if version.status == 0 {
-				resp.Variant = "v" + verString
-			} else {
-				slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
+				resp.Library = "rocm"
+				var version C.rocm_version_resp_t
+				C.rocm_get_version(*gpuHandles.rocm, &version)
+				verString := C.GoString(version.str)
+				if version.status == 0 {
+					resp.Variant = "v" + verString
+				} else {
+					slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
+				}
+				C.free(unsafe.Pointer(version.str))
 			}
-			C.free(unsafe.Pointer(version.str))
 		}
 	}
 	if resp.Library == "" {

+ 6 - 0
gpu/types.go

@@ -16,3 +16,9 @@ type GpuInfo struct {
 
 	// TODO add other useful attributes about the card here for discovery information
 }
+
+// Version is a three-part major/minor/patch version number made of unsigned
+// integers.
+type Version struct {
+	Major, Minor, Patch uint
+}

+ 0 - 1
llm/generate/gen_linux.sh

@@ -21,7 +21,6 @@ amdGPUs() {
         return
     fi
     GPU_LIST=(
-        "gfx803"
         "gfx900"
         "gfx906:xnack-"
         "gfx908:xnack-"

+ 1 - 0
llm/payload_common.go

@@ -90,6 +90,7 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string {
 	if len(dynLibs) == 0 {
 		dynLibs = []string{availableDynLibs["cpu"]}
 	}
+	slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
 	return dynLibs
 }