
nvidia libs have inconsistent ordering (#7473)

The runtime and management libraries may not always have
identical ordering, so use the device UUID to correlate instead of ID.
Daniel Hiltgen 6 months ago
Parent
Commit
29ab9fa7d7
3 changed files with 13 additions and 9 deletions
  1. discover/gpu.go  +6 -2
  2. discover/gpu_info_nvml.c  +5 -5
  3. discover/gpu_info_nvml.h  +2 -2
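
The commit message above describes the fix: correlate the two libraries' device lists by UUID, which is stable, rather than by enumeration index, which may differ between the runtime and management libraries. A minimal standalone Go sketch of that idea follows; the names (gpuRecord, mergeByUUID) are illustrative and not from the codebase:

package main

import "fmt"

// gpuRecord is a trimmed-down, hypothetical stand-in for the discover
// package's GPU info: just enough fields to show the correlation.
type gpuRecord struct {
	UUID string
	Free uint64
}

// mergeByUUID copies free-memory readings from management-library records
// into runtime-library records by matching UUIDs, so it does not matter
// whether the two libraries enumerate devices in the same order.
func mergeByUUID(runtime, mgmt []gpuRecord) {
	byUUID := make(map[string]uint64, len(mgmt))
	for _, m := range mgmt {
		byUUID[m.UUID] = m.Free
	}
	for i := range runtime {
		if free, ok := byUUID[runtime[i].UUID]; ok {
			runtime[i].Free = free
		}
	}
}

func main() {
	// The same two devices, reported in different orders by the two libraries.
	rt := []gpuRecord{{UUID: "GPU-aaa"}, {UUID: "GPU-bbb"}}
	mg := []gpuRecord{{UUID: "GPU-bbb", Free: 1 << 30}, {UUID: "GPU-aaa", Free: 2 << 30}}
	mergeByUUID(rt, mg)
	fmt.Println(rt) // [{GPU-aaa 2147483648} {GPU-bbb 1073741824}]
}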

+ 6 - 2
discover/gpu.go

@@ -316,7 +316,9 @@ func GetGPUInfo() GpuInfoList {
 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
 				if cHandles.nvml != nil {
-					C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
+					uuid := C.CString(gpuInfo.ID)
+					defer C.free(unsafe.Pointer(uuid))
+					C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 					if memInfo.err != nil {
 						slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 						C.free(unsafe.Pointer(memInfo.err))
@@ -417,7 +419,9 @@ func GetGPUInfo() GpuInfoList {
 		}
 		for i, gpu := range cudaGPUs {
 			if cHandles.nvml != nil {
-				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+				uuid := C.CString(gpu.ID)
+				defer C.free(unsafe.Pointer(uuid))
+				C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 			} else if cHandles.cudart != nil {
 				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
 			} else if cHandles.nvcuda != nil {
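
In both hunks above, the Go side converts gpuInfo.ID / gpu.ID to a C string with C.CString, which allocates a malloc'd copy that must be released with C.free; a defer placed inside a loop runs when the enclosing function returns, not at the end of each iteration. A tiny standalone cgo sketch of that ownership pattern (strlen is only a stand-in for nvml_get_free):

package main

/*
#include <stdlib.h>
#include <string.h>
*/
import "C"

import (
	"fmt"
	"unsafe"
)

// cStringLen shows the C.CString / C.free pattern used in the diff above:
// the Go side owns the malloc'd copy and must free it.
func cStringLen(s string) int {
	cs := C.CString(s)               // malloc'd copy of the Go string
	defer C.free(unsafe.Pointer(cs)) // released when this function returns
	return int(C.strlen(cs))
}

func main() {
	fmt.Println(cStringLen("GPU-00000000-0000-0000-0000-000000000000"))
}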

+ 5 - 5
discover/gpu_info_nvml.c

@@ -17,7 +17,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   } l[] = {
       {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
       {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
-      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
       {NULL, NULL},
   };
@@ -67,20 +67,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
 }
 
 
-void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
+void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
     nvmlDevice_t device;
     nvmlMemory_t memInfo = {0};
     nvmlReturn_t ret;
-    ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
+    ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
     if (ret != NVML_SUCCESS) {
-        LOG(1, "unable to get device handle %d: %d", device_id, ret);
+        LOG(1, "unable to get device handle %s: %d", uuid, ret);
         *free = 0;
         return;
     }
 
     ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
     if (ret != NVML_SUCCESS) {
-        LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
+        LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
         *free = 0;
         return;
     }
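
The lookup table in nvml_init now resolves nvmlDeviceGetHandleByUUID instead of nvmlDeviceGetHandleByIndex, and nvml_get_free passes the UUID string straight through to it. As a standalone, Linux-only cgo sketch (not part of this commit), one way to confirm the installed NVML library actually exports that symbol:

package main

/*
#cgo LDFLAGS: -ldl
#include <dlfcn.h>
#include <stdlib.h>
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	// Load the management library the way a dlopen/dlsym-based loader
	// would, then probe for the by-UUID entry point.
	lib := C.CString("libnvidia-ml.so.1")
	defer C.free(unsafe.Pointer(lib))
	h := C.dlopen(lib, C.RTLD_LAZY)
	if h == nil {
		fmt.Println("libnvidia-ml.so.1 not found")
		return
	}
	defer C.dlclose(h)

	sym := C.CString("nvmlDeviceGetHandleByUUID")
	defer C.free(unsafe.Pointer(sym))
	if C.dlsym(h, sym) == nil {
		fmt.Println("nvmlDeviceGetHandleByUUID not exported by this driver")
		return
	}
	fmt.Println("nvmlDeviceGetHandleByUUID is available")
}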

+ 2 - 2
discover/gpu_info_nvml.h

@@ -25,7 +25,7 @@ typedef struct nvml_handle {
   uint16_t verbose;
   nvmlReturn_t (*nvmlInit_v2)(void);
   nvmlReturn_t (*nvmlShutdown)(void);
-  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
   nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
 } nvml_handle_t;
 
@@ -41,7 +41,7 @@ typedef struct nvml_compute_capability {
 } nvml_compute_capability_t;
 
 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
-void nvml_get_free(nvml_handle_t ch,  int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
+void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
 void nvml_release(nvml_handle_t ch);
 
 #endif  // __GPU_INFO_NVML_H__