Browse Source

Merge pull request #2162 from dhiltgen/rocm_real_gpus

Report more information about GPUs in verbose mode
Daniel Hiltgen 1 năm trước cách đây
mục cha
commit
f63dc2db5c
6 tập tin đã thay đổi với 171 bổ sung30 xóa
  1. 9 0
      gpu/gpu.go
  2. 7 0
      gpu/gpu_info.h
  3. 52 2
      gpu/gpu_info_cuda.c
  4. 12 0
      gpu/gpu_info_cuda.h
  5. 81 27
      gpu/gpu_info_rocm.c
  6. 10 1
      gpu/gpu_info_rocm.h

+ 9 - 0
gpu/gpu.go

@@ -259,6 +259,7 @@ func FindGPULibs(baseLibName string, patterns []string) []string {
 
 func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 	var resp C.cuda_init_resp_t
+	resp.ch.verbose = getVerboseState()
 	for _, libPath := range cudaLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
@@ -275,6 +276,7 @@ func LoadCUDAMgmt(cudaLibPaths []string) *C.cuda_handle_t {
 
 func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 	var resp C.rocm_init_resp_t
+	resp.rh.verbose = getVerboseState()
 	for _, libPath := range rocmLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
@@ -288,3 +290,10 @@ func LoadROCMMgmt(rocmLibPaths []string) *C.rocm_handle_t {
 	}
 	return nil
 }
+
+func getVerboseState() C.uint16_t {
+	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+		return C.uint16_t(1)
+	}
+	return C.uint16_t(0)
+}

+ 7 - 0
gpu/gpu_info.h

@@ -27,6 +27,13 @@
 
 #endif
 
+#define LOG(verbose, ...) \
+  do { \
+    if (verbose) { \
+      fprintf(stderr, __VA_ARGS__); \
+    } \
+  } while (0)
+
 #ifdef __cplusplus
 extern "C" {
 #endif

+ 52 - 2
gpu/gpu_info_cuda.c

@@ -4,7 +4,7 @@
 
 #include <string.h>
 
-#define CUDA_LOOKUP_SIZE 6
+#define CUDA_LOOKUP_SIZE 12
 
 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   nvmlReturn_t ret;
@@ -23,6 +23,12 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
       {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
       {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
+      {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
+      {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
+      {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
+      {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
+      {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
+      {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
   };
 
   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
@@ -58,7 +64,13 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     resp->err = strdup(buf);
   }
 
-  return;
+  // Report driver version if we're in verbose mode, ignore errors
+  ret = (*resp->ch.nvmlSystemGetDriverVersion)(buf, buflen);
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlSystemGetDriverVersion failed: %d\n", ret);
+  } else {
+    LOG(resp->ch.verbose, "CUDA driver version: %s\n", buf);
+  }
 }
 
 void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
@@ -98,6 +110,44 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
       resp->err = strdup(buf);
       return;
     }
+    if (h.verbose) {
+      nvmlBrandType_t brand = 0;
+      // When in verbose mode, report more information about
+      // the card we discover, but don't fail on error
+      ret = (*h.nvmlDeviceGetName)(device, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetName failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA device name: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBoardPartNumber)(device, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBoardPartNumber failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA part number: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetSerial)(device, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetSerial failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA S/N: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetVbiosVersion)(device, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetVbiosVersion failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA vbios version: %s\n", i, buf);
+      }
+      ret = (*h.nvmlDeviceGetBrand)(device, &brand);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "nvmlDeviceGetBrand failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] CUDA brand: %d\n", i, brand);
+      }
+    }
+
+    LOG(h.verbose, "[%d] CUDA totalMem %ld\n", i, memInfo.total);
+    LOG(h.verbose, "[%d] CUDA usedMem %ld\n", i, memInfo.free);
 
     resp->total += memInfo.total;
     resp->free += memInfo.free;

+ 12 - 0
gpu/gpu_info_cuda.h

@@ -15,14 +15,26 @@ typedef struct nvmlMemory_st {
   unsigned long long used;
 } nvmlMemory_t;
 
+typedef enum nvmlBrandType_enum
+{
+    NVML_BRAND_UNKNOWN          = 0,
+} nvmlBrandType_t;
+
 typedef struct cuda_handle {
   void *handle;
+  uint16_t verbose;
   nvmlReturn_t (*initFn)(void);
   nvmlReturn_t (*shutdownFn)(void);
   nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
   nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
   nvmlReturn_t (*getCount)(unsigned int *);
   nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
+  nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetVbiosVersion) (nvmlDevice_t device, char* version, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetBoardPartNumber) (nvmlDevice_t device, char* partNumber, unsigned int  length);
+  nvmlReturn_t (*nvmlDeviceGetBrand) (nvmlDevice_t device, nvmlBrandType_t* type);
 } cuda_handle_t;
 
 typedef struct cuda_init_resp {

+ 81 - 27
gpu/gpu_info_rocm.c

@@ -4,7 +4,7 @@
 
 #include <string.h>
 
-#define ROCM_LOOKUP_SIZE 5
+#define ROCM_LOOKUP_SIZE 14
 
 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
   rsmi_status_t ret;
@@ -21,7 +21,15 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
       {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
       {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
       {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
-      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
+      {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
+      {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
+      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
+      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
+      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
+      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
+      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
+      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
+      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
   };
 
   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
@@ -62,8 +70,6 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
 void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
   resp->err = NULL;
-  // uint32_t num_devices;
-  // uint16_t device;
   uint64_t totalMem = 0;
   uint64_t usedMem = 0;
   rsmi_status_t ret;
@@ -76,34 +82,82 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
     return;
   }
 
-  // TODO - iterate through devices...  ret =
-  // rsmi_num_monitor_devices(&num_devices);
-
-  // ret = (*h.getHandle)(0, &device);
-  // if (ret != RSMI_STATUS_SUCCESS) {
-  //     printf("rocm vram device lookup failure: %d\n", ret);
-  //     return -1;
-  // }
-
-  // Get total memory - used memory for available memory
-  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
-  if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
-    resp->err = strdup(buf);
-    return;
-  }
-  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
+  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
   if (ret != RSMI_STATUS_SUCCESS) {
-    snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+    snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
     return;
   }
+  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);
+
+  resp->total = 0;
+  resp->free = 0;
+  for (i = 0; i < resp->count; i++) {
+    if (h.verbose) {
+      // When in verbose mode, report more information about
+      // the card we discover, but don't fail on error
+      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
+      }
+      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
+      if (ret != RSMI_STATUS_SUCCESS) {
+        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
+      } else {
+        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
+      }
+    }
 
-  // TODO: set this to the actual number of devices
-  resp->count = 1;
-  resp->total = totalMem;
-  resp->free = totalMem - usedMem;
-  return;
+    // Get total memory - used memory for available memory
+    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
+    if (ret != RSMI_STATUS_SUCCESS) {
+      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
+    if (ret != RSMI_STATUS_SUCCESS) {
+      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
+      resp->err = strdup(buf);
+      return;
+    }
+    LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
+    LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
+    resp->total += totalMem;
+    resp->free += totalMem - usedMem;
+  }
 }
 
 void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {

+ 10 - 1
gpu/gpu_info_rocm.h

@@ -24,12 +24,21 @@ typedef enum rsmi_memory_type {
 
 typedef struct rocm_handle {
   void *handle;
+  uint16_t verbose;
   rsmi_status_t (*initFn)(uint64_t);
   rsmi_status_t (*shutdownFn)(void);
   rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
   rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
   rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
-  // rsmi_status_t (*getHandle)(uint32_t, uint16_t *);
+  rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
+  rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
+  rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);
+  rsmi_status_t (*rsmi_dev_brand_get) (uint32_t, char *, uint32_t);		
+  rsmi_status_t (*rsmi_dev_vendor_name_get) (uint32_t, char *, uint32_t);		
+  rsmi_status_t (*rsmi_dev_vram_vendor_get) (uint32_t, char *, uint32_t);		
+  rsmi_status_t (*rsmi_dev_serial_number_get) (uint32_t, char *, uint32_t);		
+  rsmi_status_t (*rsmi_dev_subsystem_name_get) (uint32_t, char *, uint32_t);		
+  rsmi_status_t (*rsmi_dev_vbios_version_get) (uint32_t, char *, uint32_t);		
 } rocm_handle_t;
 
 typedef struct rocm_init_resp {