1 سال پیش · 013fd07139
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -40,11 +40,13 @@ var CudaLinuxGlobs = []string{
 
				 	"/usr/lib/wsl/lib/libnvidia-ml.so*",
			
 
				 	"/usr/lib/wsl/drivers/*/libnvidia-ml.so*",
			
 
				 	"/opt/cuda/lib64/libnvidia-ml.so*",
			
 
				-	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
			
 
				 	"/usr/lib*/libnvidia-ml.so*",
			
 
				 	"/usr/local/lib*/libnvidia-ml.so*",
			
 
				 	"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*",
			
 
				 	"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*",
			
 
				+
			
 
				+	// TODO: are these stubs ever valid?
			
 
				+	"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*",
			
 
				 }
			
 
				 
			
 
				 var CudaWindowsGlobs = []string{
			
--- a/gpu/gpu_info_cuda.c
+++ b/gpu/gpu_info_cuda.c
@@ -4,8 +4,6 @@
 
				 
			
 
				 #include <string.h>
			
 
				 
			
 
				-#define CUDA_LOOKUP_SIZE 12
			
 
				-
			
 
				 void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
			
 
				   nvmlReturn_t ret;
			
 
				   resp->err = NULL;
			
@@ -16,24 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
 
				   struct lookup {
			
 
				     char *s;
			
 
				     void **p;
			
 
				-  } l[CUDA_LOOKUP_SIZE] = {
			
 
				-      {"nvmlInit_v2", (void *)&resp->ch.initFn},
			
 
				-      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
			
 
				-      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
			
 
				-      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
			
 
				-      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
			
 
				-      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
			
 
				+  } l[] = {
			
 
				+      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
			
 
				+      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
			
 
				+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
			
 
				+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
			
 
				+      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.nvmlDeviceGetCount_v2},
			
 
				+      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.nvmlDeviceGetCudaComputeCapability},
			
 
				       {"nvmlSystemGetDriverVersion", (void *)&resp->ch.nvmlSystemGetDriverVersion},
			
 
				       {"nvmlDeviceGetName", (void *)&resp->ch.nvmlDeviceGetName},
			
 
				       {"nvmlDeviceGetSerial", (void *)&resp->ch.nvmlDeviceGetSerial},
			
 
				       {"nvmlDeviceGetVbiosVersion", (void *)&resp->ch.nvmlDeviceGetVbiosVersion},
			
 
				       {"nvmlDeviceGetBoardPartNumber", (void *)&resp->ch.nvmlDeviceGetBoardPartNumber},
			
 
				       {"nvmlDeviceGetBrand", (void *)&resp->ch.nvmlDeviceGetBrand},
			
 
				+      {NULL, NULL},
			
 
				   };
			
 
				 
			
 
				   resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
			
 
				   if (!resp->ch.handle) {
			
 
				     char *msg = LOAD_ERR();
			
 
				+    LOG(resp->ch.verbose, "library %s load err: %s\n", cuda_lib_path, msg);
			
 
				     snprintf(buf, buflen,
			
 
				              "Unable to load %s library to query for Nvidia GPUs: %s",
			
 
				              cuda_lib_path, msg);
			
@@ -42,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
 
				     return;
			
 
				   }
			
 
				 
			
 
				-  for (i = 0; i < CUDA_LOOKUP_SIZE; i++) {  // TODO - fix this to use a null terminated list
			
 
				+  // TODO once we've squashed the remaining corner cases remove this log
			
 
				+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", cuda_lib_path);
			
 
				+  
			
 
				+  for (i = 0; l[i].s != NULL; i++) {
			
 
				+    // TODO once we've squashed the remaining corner cases remove this log
			
 
				+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
			
 
				+
			
 
				     *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
			
 
				     if (!l[i].p) {
			
 
				-      UNLOAD_LIBRARY(resp->ch.handle);
			
 
				       resp->ch.handle = NULL;
			
 
				       char *msg = LOAD_ERR();
			
 
				+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
			
 
				+      UNLOAD_LIBRARY(resp->ch.handle);
			
 
				       snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
			
 
				                msg);
			
 
				       free(msg);
			
@@ -56,8 +63,9 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
 
				     }
			
 
				   }
			
 
				 
			
 
				-  ret = (*resp->ch.initFn)();
			
 
				+  ret = (*resp->ch.nvmlInit_v2)();
			
 
				   if (ret != NVML_SUCCESS) {
			
 
				+    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
			
 
				     UNLOAD_LIBRARY(resp->ch.handle);
			
 
				     resp->ch.handle = NULL;
			
 
				     snprintf(buf, buflen, "nvml vram init failure: %d", ret);
			
@@ -87,7 +95,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
 
				     return;
			
 
				   }
			
 
				 
			
 
				-  ret = (*h.getCount)(&resp->count);
			
 
				+  ret = (*h.nvmlDeviceGetCount_v2)(&resp->count);
			
 
				   if (ret != NVML_SUCCESS) {
			
 
				     snprintf(buf, buflen, "unable to get device count: %d", ret);
			
 
				     resp->err = strdup(buf);
			
@@ -97,14 +105,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
 
				   resp->total = 0;
			
 
				   resp->free = 0;
			
 
				   for (i = 0; i < resp->count; i++) {
			
 
				-    ret = (*h.getHandle)(i, &device);
			
 
				+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
			
 
				     if (ret != NVML_SUCCESS) {
			
 
				       snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
			
 
				       resp->err = strdup(buf);
			
 
				       return;
			
 
				     }
			
 
				 
			
 
				-    ret = (*h.getMemInfo)(device, &memInfo);
			
 
				+    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
			
 
				     if (ret != NVML_SUCCESS) {
			
 
				       snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
			
 
				       resp->err = strdup(buf);
			
@@ -172,7 +180,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
 
				   }
			
 
				 
			
 
				   unsigned int devices;
			
 
				-  ret = (*h.getCount)(&devices);
			
 
				+  ret = (*h.nvmlDeviceGetCount_v2)(&devices);
			
 
				   if (ret != NVML_SUCCESS) {
			
 
				     snprintf(buf, buflen, "unable to get device count: %d", ret);
			
 
				     resp->err = strdup(buf);
			
@@ -180,14 +188,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
 
				   }
			
 
				 
			
 
				   for (i = 0; i < devices; i++) {
			
 
				-    ret = (*h.getHandle)(i, &device);
			
 
				+    ret = (*h.nvmlDeviceGetHandleByIndex)(i, &device);
			
 
				     if (ret != NVML_SUCCESS) {
			
 
				       snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
			
 
				       resp->err = strdup(buf);
			
 
				       return;
			
 
				     }
			
 
				 
			
 
				-    ret = (*h.getComputeCapability)(device, &major, &minor);
			
 
				+    ret = (*h.nvmlDeviceGetCudaComputeCapability)(device, &major, &minor);
			
 
				     if (ret != NVML_SUCCESS) {
			
 
				       snprintf(buf, buflen, "device compute capability lookup failure %d: %d", i, ret);
			
 
				       resp->err = strdup(buf);
			
--- a/gpu/gpu_info_cuda.h
+++ b/gpu/gpu_info_cuda.h
@@ -23,12 +23,12 @@ typedef enum nvmlBrandType_enum
 
				 typedef struct cuda_handle {
			
 
				   void *handle;
			
 
				   uint16_t verbose;
			
 
				-  nvmlReturn_t (*initFn)(void);
			
 
				-  nvmlReturn_t (*shutdownFn)(void);
			
 
				-  nvmlReturn_t (*getHandle)(unsigned int, nvmlDevice_t *);
			
 
				-  nvmlReturn_t (*getMemInfo)(nvmlDevice_t, nvmlMemory_t *);
			
 
				-  nvmlReturn_t (*getCount)(unsigned int *);
			
 
				-  nvmlReturn_t (*getComputeCapability)(nvmlDevice_t, int* major, int* minor);
			
 
				+  nvmlReturn_t (*nvmlInit_v2)(void);
			
 
				+  nvmlReturn_t (*nvmlShutdown)(void);
			
 
				+  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
			
 
				+  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
			
 
				+  nvmlReturn_t (*nvmlDeviceGetCount_v2)(unsigned int *);
			
 
				+  nvmlReturn_t (*nvmlDeviceGetCudaComputeCapability)(nvmlDevice_t, int* major, int* minor);
			
 
				   nvmlReturn_t (*nvmlSystemGetDriverVersion) (char* version, unsigned int  length);
			
 
				   nvmlReturn_t (*nvmlDeviceGetName) (nvmlDevice_t device, char* name, unsigned int  length);
			
 
				   nvmlReturn_t (*nvmlDeviceGetSerial) (nvmlDevice_t device, char* serial, unsigned int  length);
			
--- a/gpu/gpu_info_rocm.c
+++ b/gpu/gpu_info_rocm.c
@@ -4,8 +4,6 @@
 
				 
			
 
				 #include <string.h>
			
 
				 
			
 
				-#define ROCM_LOOKUP_SIZE 14
			
 
				-
			
 
				 void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
			
 
				   rsmi_status_t ret;
			
 
				   resp->err = NULL;
			
@@ -15,12 +13,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
				   struct lookup {
			
 
				     char *s;
			
 
				     void **p;
			
 
				-  } l[ROCM_LOOKUP_SIZE] = {
			
 
				-      {"rsmi_init", (void *)&resp->rh.initFn},
			
 
				-      {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
			
 
				-      {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
			
 
				-      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
			
 
				-      {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
			
 
				+  } l[] = {
			
 
				+      {"rsmi_init", (void *)&resp->rh.rsmi_init},
			
 
				+      {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
			
 
				+      {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
			
 
				+      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
			
 
				+      {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
			
 
				       {"rsmi_num_monitor_devices", (void*)&resp->rh.rsmi_num_monitor_devices},
			
 
				       {"rsmi_dev_id_get", (void*)&resp->rh.rsmi_dev_id_get},
			
 
				       {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
			
@@ -30,6 +28,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
				       {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
			
 
				       {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
			
 
				       {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
			
 
				+      {NULL, NULL},
			
 
				   };
			
 
				 
			
 
				   resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
			
@@ -43,12 +42,19 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
				     return;
			
 
				   }
			
 
				 
			
 
				-  for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
			
 
				+  // TODO once we've squashed the remaining corner cases remove this log
			
 
				+  LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);
			
 
				+
			
 
				+  for (i = 0; l[i].s != NULL; i++) {
			
 
				+    // TODO once we've squashed the remaining corner cases remove this log
			
 
				+    LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);
			
 
				+
			
 
				     *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
			
 
				     if (!l[i].p) {
			
 
				-      UNLOAD_LIBRARY(resp->rh.handle);
			
 
				       resp->rh.handle = NULL;
			
 
				       char *msg = LOAD_ERR();
			
 
				+      LOG(resp->rh.verbose, "dlerr: %s\n", msg);
			
 
				+      UNLOAD_LIBRARY(resp->rh.handle);
			
 
				       snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
			
 
				                msg);
			
 
				       free(msg);
			
@@ -57,8 +63,9 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
 
				     }
			
 
				   }
			
 
				 
			
 
				-  ret = (*resp->rh.initFn)(0);
			
 
				+  ret = (*resp->rh.rsmi_init)(0);
			
 
				   if (ret != RSMI_STATUS_SUCCESS) {
			
 
				+    LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
			
 
				     UNLOAD_LIBRARY(resp->rh.handle);
			
 
				     resp->rh.handle = NULL;
			
 
				     snprintf(buf, buflen, "rocm vram init failure: %d", ret);
			
@@ -141,13 +148,13 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
 
				     }
			
 
				 
			
 
				     // Get total memory - used memory for available memory
			
 
				-    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
			
 
				+    ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
			
 
				     if (ret != RSMI_STATUS_SUCCESS) {
			
 
				       snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
			
 
				       resp->err = strdup(buf);
			
 
				       return;
			
 
				     }
			
 
				-    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
			
 
				+    ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
			
 
				     if (ret != RSMI_STATUS_SUCCESS) {
			
 
				       snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
			
 
				       resp->err = strdup(buf);
			
@@ -170,7 +177,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
 
				   }
			
 
				   rsmi_version_t ver;
			
 
				   rsmi_status_t ret;
			
 
				-  ret = h.versionGetFn(&ver);
			
 
				+  ret = h.rsmi_version_get(&ver);
			
 
				   if (ret != RSMI_STATUS_SUCCESS) {
			
 
				     snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
			
 
				     resp->status = 1;
			
--- a/gpu/gpu_info_rocm.h
+++ b/gpu/gpu_info_rocm.h
@@ -25,11 +25,11 @@ typedef enum rsmi_memory_type {
 
				 typedef struct rocm_handle {
			
 
				   void *handle;
			
 
				   uint16_t verbose;
			
 
				-  rsmi_status_t (*initFn)(uint64_t);
			
 
				-  rsmi_status_t (*shutdownFn)(void);
			
 
				-  rsmi_status_t (*totalMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
			
 
				-  rsmi_status_t (*usageMemFn)(uint32_t, rsmi_memory_type_t, uint64_t *);
			
 
				-  rsmi_status_t (*versionGetFn) (rsmi_version_t *version);
			
 
				+  rsmi_status_t (*rsmi_init)(uint64_t);
			
 
				+  rsmi_status_t (*rsmi_shut_down)(void);
			
 
				+  rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
			
 
				+  rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t *);
			
 
				+  rsmi_status_t (*rsmi_version_get) (rsmi_version_t *version);
			
 
				   rsmi_status_t (*rsmi_num_monitor_devices) (uint32_t *);
			
 
				   rsmi_status_t (*rsmi_dev_id_get)(uint32_t, uint16_t *);
			
 
				   rsmi_status_t (*rsmi_dev_name_get) (uint32_t,char *,size_t);