فهرست منبع

More logging for gpu management

Fix an ordering glitch between dlerr and dlclose, and add more logging to
help root-cause some crashes users are hitting. This also refines the
function pointer names to use the underlying function names instead
of simplified names for readability.
Daniel Hiltgen 1 سال پیش
والد
کامیت
013fd07139
5 فایلهای تغییر یافته به همراه 61 افزوده شده و 44 حذف شده
  1. 3 1
      gpu/gpu.go
  2. 26 18
      gpu/gpu_info_cuda.c
  3. 6 6
      gpu/gpu_info_cuda.h
  4. 21 14
      gpu/gpu_info_rocm.c
  5. 5 5
      gpu/gpu_info_rocm.h

+ 3 - 1
gpu/gpu.go

@@ -40,11 +40,13 @@ var CudaLinuxGlobs = []string{
 	"/usr/lib/wsl/lib/libnvidia-ml.so*",
 	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
 	"/opt/cuda/lib64/libnvidia-ml.so*",
-	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 	"/usr/lib*/libnvidia-ml.so*",
 	"/usr/local/lib*/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
 	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
+
+	// TODO: are these stubs ever valid?
+	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
 }
 
 var CudaWindowsGlobs = []string{

+ 26 - 18
gpu/gpu_info_cuda.c

@@ -4,8 +4,6 @@
 
 #include <string.h>
 
-#define CUDA_LOOKUP_SIZE 12
-
 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   nvmlReturn_t ret;
   resp->err = NULL;
@@ -16,24 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
   struct lookup {
     char *s;
     void **p;
-  } l[CUDA_LOOKUP_SIZE] = {
-      {"nvmlInit_v2", (void *)&resp->ch.initFn},
-      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
-      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
-      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
-      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
-      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
+  } l[] = {
+      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
+      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
+      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
+      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
       {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
       {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
       {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
       {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
       {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
       {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
+      {NULL, NULL},
   };
 
   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
   if (!resp->ch.handle) {
     char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
     snprintf(buf, buflen,
              "Unable to load %s library to query for Nvidia GPUs: %s",
              cuda_lib_path, msg);
@@ -42,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     return;
   }
 
-  for (i = 0; i < CUDA_LOOKUP_SIZE; i++) {  // TODO - fix this to use a null terminated list
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
+  
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
     if (!l[i].p) {
-      UNLOAD_LIBRARY(resp->ch.handle);
       resp->ch.handle = NULL;
       char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
       snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
                msg);
       free(msg);
@@ -56,8 +63,9 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
     }
   }
 
-  ret = (*resp->ch.initFn)();
+  ret = (*resp->ch.nvmlInit_v2)();
   if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
     UNLOAD_LIBRARY(resp->ch.handle);
     resp->ch.handle = NULL;
     snprintf(buf, buflen, "nvml vram init failure: %d", ret);
@@ -87,7 +95,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
     return;
   }
 
-  ret = (*h.getCount)(&resp->count);
+  ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
   if (ret != NVML_SUCCESS) {
     snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
@@ -97,14 +105,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
   resp->total = 0;
   resp->free = 0;
   for (i = 0; i < resp->count; i++) {
-    ret = (*h.getHandle)(i, &device);
+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
       resp->err = strdup(buf);
       return;
     }
 
-    ret = (*h.getMemInfo)(device, &memInfo);
+    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
       resp->err = strdup(buf);
@@ -172,7 +180,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
   }
 
   unsigned int devices;
-  ret = (*h.getCount)(&devices);
+  ret = (*h.nvmlDeviceGetCount_v2)(&devices);
   if (ret != NVML_SUCCESS) {
     snprintf(buf, buflen, "unable to get device count: %d", ret);
     resp->err = strdup(buf);
@@ -180,14 +188,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
   }
 
   for (i = 0; i < devices; i++) {
-    ret = (*h.getHandle)(i, &device);
+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
       resp->err = strdup(buf);
       return;
     }
 
-    ret = (*h.getComputeCapability)(device, &major, &minor);
+    ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
     if (ret != NVML_SUCCESS) {
       snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
       resp->err = strdup(buf);

+ 6 - 6
gpu/gpu_info_cuda.h

@@ -23,12 +23,12 @@ typedef enum nvmlBrandType_enum
 typedef struct cuda_handle {
   void *handle;
   uint16_t verbose;
-  nvmlReturn_t (*initFn)(void);
-  nvmlReturn_t (*shutdownFn)(void);
-  nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
-  nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
-  nvmlReturn_t (*getCount)(unsigned int *);
-  nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
+  nvmlReturn_t (*nvmlInit_v2)(void);
+  nvmlReturn_t (*nvmlShutdown)(void);
+  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+  nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
+  nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
   nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int  length);
   nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int  length);
   nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int  length);

+ 21 - 14
gpu/gpu_info_rocm.c

@@ -4,8 +4,6 @@
 
 #include <string.h>
 
-#define ROCM_LOOKUP_SIZE 14
-
 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
   rsmi_status_t ret;
   resp->err = NULL;
@@ -15,12 +13,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
   struct lookup {
     char *s;
     void **p;
-  } l[ROCM_LOOKUP_SIZE] = {
-      {"rsmi_init", (void *)&resp->rh.initFn},
-      {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
-      {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
-      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
-      {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
+  } l[] = {
+      {"rsmi_init", (void *)&resp->rh.rsmi_init},
+      {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
+      {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
+      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
+      {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
       {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
       {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
       {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
@@ -30,6 +28,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
       {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
       {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
       {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
+      {NULL, NULL},
   };
 
   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
@@ -43,12 +42,19 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
     return;
   }
 
-  for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
+
     *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
     if (!l[i].p) {
-      UNLOAD_LIBRARY(resp->rh.handle);
       resp->rh.handle = NULL;
       char *msg = LOAD_ERR();
+      LOG(resp->rh.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->rh.handle);
       snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
                msg);
       free(msg);
@@ -57,8 +63,9 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
     }
   }
 
-  ret = (*resp->rh.initFn)(0);
+  ret = (*resp->rh.rsmi_init)(0);
   if (ret != RSMI_STATUS_SUCCESS) {
+    LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
     UNLOAD_LIBRARY(resp->rh.handle);
     resp->rh.handle = NULL;
     snprintf(buf, buflen, "rocm vram init failure: %d", ret);
@@ -141,13 +148,13 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
     }
 
     // Get total memory - used memory for available memory
-    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
+    ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
     if (ret != RSMI_STATUS_SUCCESS) {
       snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
       resp->err = strdup(buf);
       return;
     }
-    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
+    ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
     if (ret != RSMI_STATUS_SUCCESS) {
       snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
       resp->err = strdup(buf);
@@ -170,7 +177,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
   }
   rsmi_version_t ver;
   rsmi_status_t ret;
-  ret = h.versionGetFn(&ver);
+  ret = h.rsmi_version_get(&ver);
   if (ret != RSMI_STATUS_SUCCESS) {
     snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
     resp->status = 1;

+ 5 - 5
gpu/gpu_info_rocm.h

@@ -25,11 +25,11 @@ typedef enum rsmi_memory_type {
 typedef struct rocm_handle {
   void *handle;
   uint16_t verbose;
-  rsmi_status_t (*initFn)(uint64_t);
-  rsmi_status_t (*shutdownFn)(void);
-  rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
-  rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
-  rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
+  rsmi_status_t (*rsmi_init)(uint64_t);
+  rsmi_status_t (*rsmi_shut_down)(void);
+  rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
+  rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
   rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
   rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
   rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);