Kaynağa Gözat

Merge pull request #2174 from dhiltgen/rocm_real_gpus

More logging for gpu management
Daniel Hiltgen 1 yıl önce
ebeveyn
işleme
a170888dd4
5 değiştirilmiş dosya ile 61 ekleme ve 44 silme
  1. + 3 - 1
      gpu/gpu.go
  2. + 26 - 18
      gpu/gpu_info_cuda.c
  3. + 6 - 6
      gpu/gpu_info_cuda.h
  4. + 21 - 14
      gpu/gpu_info_rocm.c
  5. + 5 - 5
      gpu/gpu_info_rocm.h

+ 3 - 1
gpu/gpu.go

@@ -40,11 +40,13 @@ var CudaLinuxGlobs = []string{
 	"/usr/lib/wsl/lib/libnvidia-ml.so*",
 	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
 	"/opt/cuda/lib64/libnvidia-ml.so*",
-	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 	"/usr/lib*/libnvidia-ml.so*",
 	"/usr/local/lib*/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
+
+	// TODO: are these stubs ever valid?
+	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 }
 
 var CudaWindowsGlobs = []string{

+ 26 - 18
gpu/gpu_info_cuda.c

@@ -4,8 +4,6 @@
 
 #include <string.h>
 
-#define CUDA_LOOKUP_SIZE 12
-
 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   nvmlReturn_t ret;
   resp->err = NULL;
@@ -16,24 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   struct lookup {
     char *s;
     void **p;
-  } l[CUDA_LOOKUP_SIZE] = {
-      {"nvmlInit_v2", (void *)&resp->ch.initFn},
-      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
-      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
-      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
-      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
-      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
+  } l[] = {
+      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
+      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
+      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
+      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
       {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
       {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
       {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
       {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
       {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
       {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
+      {NULL, NULL},
   };
 
   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
   if (!resp->ch.handle) {
     char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
     snprintf(buf, buflen,
              "Unable to load %s library to query for Nvidia GPUs: %s",
              cuda_lib_path, msg);
@@ -42,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     return;
   }
 
-  for (i = 0; i < CUDA_LOOKUP_SIZE; i++) {  // TODO - fix this to use a null terminated list
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
+  
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
     if (!l[i].p) {
-      UNLOAD_LIBRARY(resp->ch.handle);
       resp->ch.handle = NULL;
       char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
       snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
                msg);
       free(msg);
@@ -56,8 +63,9 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     }
   }
 
-  ret = (*resp->ch.initFn)();
+  ret = (*resp->ch.nvmlInit_v2)();
   if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
     UNLOAD_LIBRARY(resp->ch.handle);
     resp->ch.handle = NULL;
     snprintf(buf, buflen, "nvml vram init failure: %d", ret);
@@ -87,7 +95,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
     return;
   }
 
-  ret = (*h.getCount)(&resp->count);
+  ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
   if (ret != NVML_SUCCESS) {
     snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
@@ -97,14 +105,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
   resp->total = 0;
   resp->free = 0;
   for (i = 0; i < resp->count; i++) {
-    ret = (*h.getHandle)(i, &device);
+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
       resp->err = strdup(buf);
       return;
     }
 
-    ret = (*h.getMemInfo)(device, &memInfo);
+    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
       resp->err = strdup(buf);
@@ -172,7 +180,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
   }
 
   unsigned int devices;
-  ret = (*h.getCount)(&devices);
+  ret = (*h.nvmlDeviceGetCount_v2)(&devices);
   if (ret != NVML_SUCCESS) {
     snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
@@ -180,14 +188,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
   }
 
   for (i = 0; i < devices; i++) {
-    ret = (*h.getHandle)(i, &device);
+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
       resp->err = strdup(buf);
       return;
     }
 
-    ret = (*h.getComputeCapability)(device, &major, &minor);
+    ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
       resp->err = strdup(buf);

+ 6 - 6
gpu/gpu_info_cuda.h

@@ -23,12 +23,12 @@ typedef enum nvmlBrandType_enum
 typedef struct cuda_handle {
   void *handle;
   uint16_t verbose;
-  nvmlReturn_t (*initFn)(void);
-  nvmlReturn_t (*shutdownFn)(void);
-  nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
-  nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
-  nvmlReturn_t (*getCount)(unsigned int *);
-  nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
+  nvmlReturn_t (*nvmlInit_v2)(void);
+  nvmlReturn_t (*nvmlShutdown)(void);
+  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+  nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
+  nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
   nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int  length);
   nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int  length);
   nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int  length);

+ 21 - 14
gpu/gpu_info_rocm.c

@@ -4,8 +4,6 @@
 
 #include <string.h>
 
-#define ROCM_LOOKUP_SIZE 14
-
 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
   rsmi_status_t ret;
   resp->err = NULL;
@@ -15,12 +13,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
   struct lookup {
     char *s;
     void **p;
-  } l[ROCM_LOOKUP_SIZE] = {
-      {"rsmi_init", (void *)&resp->rh.initFn},
-      {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
-      {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
-      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
-      {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
+  } l[] = {
+      {"rsmi_init", (void *)&resp->rh.rsmi_init},
+      {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
+      {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
+      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
+      {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
       {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
       {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
       {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
@@ -30,6 +28,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
       {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
       {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
       {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
+      {NULL, NULL},
   };
 
   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
@@ -43,12 +42,19 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
     return;
   }
 
-  for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
+
     *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
     if (!l[i].p) {
-      UNLOAD_LIBRARY(resp->rh.handle);
       resp->rh.handle = NULL;
       char *msg = LOAD_ERR();
+      LOG(resp->rh.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->rh.handle);
       snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
                msg);
       free(msg);
@@ -57,8 +63,9 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
     }
   }
 
-  ret = (*resp->rh.initFn)(0);
+  ret = (*resp->rh.rsmi_init)(0);
   if (ret != RSMI_STATUS_SUCCESS) {
+    LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
     UNLOAD_LIBRARY(resp->rh.handle);
     resp->rh.handle = NULL;
     snprintf(buf, buflen, "rocm vram init failure: %d", ret);
@@ -141,13 +148,13 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
     }
 
     // Get total memory - used memory for available memory
-    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
+    ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
     if (ret != RSMI_STATUS_SUCCESS) {
       snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
       resp->err = strdup(buf);
       return;
     }
-    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
+    ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
     if (ret != RSMI_STATUS_SUCCESS) {
       snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
       resp->err = strdup(buf);
@@ -170,7 +177,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
   }
   rsmi_version_t ver;
   rsmi_status_t ret;
-  ret = h.versionGetFn(&ver);
+  ret = h.rsmi_version_get(&ver);
   if (ret != RSMI_STATUS_SUCCESS) {
     snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
     resp->status = 1;

+ 5 - 5
gpu/gpu_info_rocm.h

@@ -25,11 +25,11 @@ typedef enum rsmi_memory_type {
 typedef struct rocm_handle {
   void *handle;
   uint16_t verbose;
-  rsmi_status_t (*initFn)(uint64_t);
-  rsmi_status_t (*shutdownFn)(void);
-  rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
-  rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
-  rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
+  rsmi_status_t (*rsmi_init)(uint64_t);
+  rsmi_status_t (*rsmi_shut_down)(void);
+  rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
   rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
   rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
   rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);