
nvidia libs have inconsistent ordering (#7473)

The runtime and management libraries may not always have
identical ordering, so use the device UUID to correlate instead of ID.
Daniel Hiltgen 6 months ago
Parent
Commit
29ab9fa7d7
3 changed files with 13 additions and 9 deletions
  1. discover/gpu.go  +6 -2
  2. discover/gpu_info_nvml.c  +5 -5
  3. discover/gpu_info_nvml.h  +2 -2
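
The commit message above describes the fix: correlate the two libraries' device lists by UUID, which is stable, rather than by enumeration index, which may differ between the runtime and management libraries. A minimal standalone Go sketch of that idea follows; the names (gpuRecord, mergeByUUID) are illustrative and not from the codebase:

package main

import "fmt"

// gpuRecord is a trimmed-down, hypothetical stand-in for the discover
// package's GPU info: just enough fields to show the correlation.
type gpuRecord struct {
	UUID string
	Free uint64
}

// mergeByUUID copies free-memory readings from management-library records
// into runtime-library records by matching UUIDs, so it does not matter
// whether the two libraries enumerate devices in the same order.
func mergeByUUID(runtime, mgmt []gpuRecord) {
	byUUID := make(map[string]uint64, len(mgmt))
	for _, m := range mgmt {
		byUUID[m.UUID] = m.Free
	}
	for i := range runtime {
		if free, ok := byUUID[runtime[i].UUID]; ok {
			runtime[i].Free = free
		}
	}
}

func main() {
	// The same two devices, reported in different orders by the two libraries.
	rt := []gpuRecord{{UUID: "GPU-aaa"}, {UUID: "GPU-bbb"}}
	mg := []gpuRecord{{UUID: "GPU-bbb", Free: 1 << 30}, {UUID: "GPU-aaa", Free: 2 << 30}}
	mergeByUUID(rt, mg)
	fmt.Println(rt) // [{GPU-aaa 2147483648} {GPU-bbb 1073741824}]
}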

+ 6 - 2
discover/gpu.go

@@ -316,7 +316,9 @@ func GetGPUInfo() GpuInfoList {
 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
 				if cHandles.nvml != nil {
-					C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
+					uuid := C.CString(gpuInfo.ID)
+					defer C.free(unsafe.Pointer(uuid))
+					C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 					if memInfo.err != nil {
 						slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
 						C.free(unsafe.Pointer(memInfo.err))
@@ -417,7 +419,9 @@ func GetGPUInfo() GpuInfoList {
 		}
 		for i, gpu := range cudaGPUs {
 			if cHandles.nvml != nil {
-				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+				uuid := C.CString(gpu.ID)
+				defer C.free(unsafe.Pointer(uuid))
+				C.nvml_get_free(*cHandles.nvml, uuid, &memInfo.free, &memInfo.total, &memInfo.used)
 			} else if cHandles.cudart != nil {
 				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
 			} else if cHandles.nvcuda != nil {
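
In both hunks above, the Go side converts gpuInfo.ID / gpu.ID to a C string with C.CString, which allocates a malloc'd copy that must be released with C.free; a defer placed inside a loop runs when the enclosing function returns, not at the end of each iteration. A tiny standalone cgo sketch of that ownership pattern (strlen is only a stand-in for nvml_get_free):

package main

/*
#include <stdlib.h>
#include <string.h>
*/
import "C"

import (
	"fmt"
	"unsafe"
)

// cStringLen shows the C.CString / C.free pattern used in the diff above:
// the Go side owns the malloc'd copy and must free it.
func cStringLen(s string) int {
	cs := C.CString(s)               // malloc'd copy of the Go string
	defer C.free(unsafe.Pointer(cs)) // released when this function returns
	return int(C.strlen(cs))
}

func main() {
	fmt.Println(cStringLen("GPU-00000000-0000-0000-0000-000000000000"))
}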

+ 5 - 5
discover/gpu_info_nvml.c

@@ -17,7 +17,7 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
   } l[] = {
       {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
       {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
-      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetHandleByUUID", (void *)&resp->ch.nvmlDeviceGetHandleByUUID},
       {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
       {NULL, NULL},
   };
@@ -67,20 +67,20 @@ void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
 }
 
 
-void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
+void nvml_get_free(nvml_handle_t h, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used) {
     nvmlDevice_t device;
     nvmlMemory_t memInfo = {0};
     nvmlReturn_t ret;
-    ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
+    ret = (*h.nvmlDeviceGetHandleByUUID)((const char *)(uuid), &device);
     if (ret != NVML_SUCCESS) {
-        LOG(1, "unable to get device handle %d: %d", device_id, ret);
+        LOG(1, "unable to get device handle %s: %d", uuid, ret);
         *free = 0;
         return;
     }
 
     ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
     if (ret != NVML_SUCCESS) {
-        LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
+        LOG(1, "device memory info lookup failure %s: %d", uuid, ret);
         *free = 0;
         return;
     }
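
The lookup table in nvml_init now resolves nvmlDeviceGetHandleByUUID instead of nvmlDeviceGetHandleByIndex, and nvml_get_free passes the UUID string straight through to it. As a standalone, Linux-only cgo sketch (not part of this commit), one way to confirm the installed NVML library actually exports that symbol:

package main

/*
#cgo LDFLAGS: -ldl
#include <dlfcn.h>
#include <stdlib.h>
*/
import "C"

import (
	"fmt"
	"unsafe"
)

func main() {
	// Load the management library the way a dlopen/dlsym-based loader
	// would, then probe for the by-UUID entry point.
	lib := C.CString("libnvidia-ml.so.1")
	defer C.free(unsafe.Pointer(lib))
	h := C.dlopen(lib, C.RTLD_LAZY)
	if h == nil {
		fmt.Println("libnvidia-ml.so.1 not found")
		return
	}
	defer C.dlclose(h)

	sym := C.CString("nvmlDeviceGetHandleByUUID")
	defer C.free(unsafe.Pointer(sym))
	if C.dlsym(h, sym) == nil {
		fmt.Println("nvmlDeviceGetHandleByUUID not exported by this driver")
		return
	}
	fmt.Println("nvmlDeviceGetHandleByUUID is available")
}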

+ 2 - 2
discover/gpu_info_nvml.h

@@ -25,7 +25,7 @@ typedef struct nvml_handle {
   uint16_t verbose;
   nvmlReturn_t (*nvmlInit_v2)(void);
   nvmlReturn_t (*nvmlShutdown)(void);
-  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetHandleByUUID)(const char *, nvmlDevice_t *);
   nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
 } nvml_handle_t;
 
@@ -41,7 +41,7 @@ typedef struct nvml_compute_capability {
 } nvml_compute_capability_t;
 
 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
-void nvml_get_free(nvml_handle_t ch,  int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
+void nvml_get_free(nvml_handle_t ch, char *uuid, uint64_t *free, uint64_t *total, uint64_t *used);
 void nvml_release(nvml_handle_t ch);
 
 #endif  // __GPU_INFO_NVML_H__