
Reintroduce NVIDIA NVML library for Windows

This library gives us the most reliable free-VRAM reporting on Windows,
which we need to enable concurrent model scheduling.
Daniel Hiltgen, 11 months ago
commit 434dfe30c5
8 changed files with 248 additions and 9 deletions
  1. gpu/gpu.go (+76 -3)
  2. gpu/gpu_info.h (+2 -0)
  3. gpu/gpu_info_cudart.c (+2 -0)
  4. gpu/gpu_info_nvcuda.c (+3 -3)
  5. gpu/gpu_info_nvcuda.h (+1 -1)
  6. gpu/gpu_info_nvml.c (+112 -0)
  7. gpu/gpu_info_nvml.h (+48 -0)
  8. server/sched.go (+4 -2)
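
The commit message frames NVML's free-VRAM figure as the input that makes concurrent model scheduling viable. As a rough, hypothetical sketch of that admission decision in Go (the struct loosely mirrors this repo's GpuInfo; canSchedule is invented for illustration and is not part of the commit):

```go
package main

import "fmt"

// gpuInfo loosely mirrors the fields this diff reads and updates;
// FreeMemory is what nvml_get_free refreshes on Windows.
type gpuInfo struct {
	ID          string
	FreeMemory  uint64
	TotalMemory uint64
}

// canSchedule is a hypothetical fit check: admit another model only if
// its estimated VRAM footprint fits in the freshly reported free memory.
func canSchedule(g gpuInfo, estimatedVRAM uint64) bool {
	return estimatedVRAM <= g.FreeMemory
}

func main() {
	g := gpuInfo{ID: "GPU-0", FreeMemory: 6 << 30, TotalMemory: 12 << 30}
	fmt.Println(canSchedule(g, 4<<30)) // true: 4 GiB fits in 6 GiB free
}
```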

+ 76 - 3
gpu/gpu.go

@@ -28,6 +28,7 @@ type cudaHandles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
 	nvcuda      *C.nvcuda_handle_t
+	nvml        *C.nvml_handle_t
 }
 
 type oneapiHandles struct {
@@ -50,6 +51,7 @@ var (
 	nvcudaLibPath string
 	cudartLibPath string
 	oneapiLibPath string
+	nvmlLibPath   string
 	rocmGPUs      []RocmGPUInfo
 	oneapiGPUs    []OneapiGPUInfo
 )
@@ -81,6 +83,10 @@ var CudartWindowsGlobs = []string{
 	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
 }
 
+var NvmlWindowsGlobs = []string{
+	"c:\\Windows\\System32\\nvml.dll",
+}
+
 var NvcudaLinuxGlobs = []string{
 	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
 	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
@@ -117,6 +123,10 @@ func initCudaHandles() *cudaHandles {
 
 	cHandles := &cudaHandles{}
 	// Short Circuit if we already know which library to use
+	if nvmlLibPath != "" {
+		cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
+		return cHandles
+	}
 	if nvcudaLibPath != "" {
 		cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
 		return cHandles
@@ -131,6 +141,8 @@ func initCudaHandles() *cudaHandles {
 	var cudartMgmtPatterns []string
 	var nvcudaMgmtName string
 	var nvcudaMgmtPatterns []string
+	var nvmlMgmtName string
+	var nvmlMgmtPatterns []string
 
 	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
@@ -142,6 +154,12 @@ func initCudaHandles() *cudaHandles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "nvcuda.dll"
 		nvcudaMgmtPatterns = NvcudaWindowsGlobs
+
+		// Use nvml to refresh free memory on windows only
+		nvmlMgmtName = "nvml.dll"
+		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
+		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
+
 	case "linux":
 		cudartMgmtName = "libcudart.so*"
 		if tmpDir != "" {
@@ -152,10 +170,24 @@ func initCudaHandles() *cudaHandles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "libcuda.so*"
 		nvcudaMgmtPatterns = NvcudaLinuxGlobs
+
+		// nvml omitted on linux
 	default:
 		return cHandles
 	}
 
+	if len(nvmlMgmtPatterns) > 0 {
+		nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+		if len(nvmlLibPaths) > 0 {
+			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
+			if nvml != nil {
+				slog.Debug("nvidia-ml loaded", "library", libPath)
+				cHandles.nvml = nvml
+				nvmlLibPath = libPath
+			}
+		}
+	}
+
 	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
@@ -230,6 +262,9 @@ func GetGPUInfo() GpuInfoList {
 			if cHandles.nvcuda != nil {
 				C.nvcuda_release(*cHandles.nvcuda)
 			}
+			if cHandles.nvml != nil {
+				C.nvml_release(*cHandles.nvml)
+			}
 		}
 		if oHandles != nil {
 			if oHandles.oneapi != nil {
@@ -365,10 +400,17 @@ func GetGPUInfo() GpuInfoList {
 			cHandles = initCudaHandles()
 		}
 		for i, gpu := range cudaGPUs {
-			if cHandles.cudart != nil {
+			if cHandles.nvml != nil {
+				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+			} else if cHandles.cudart != nil {
 				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
+			} else if cHandles.nvcuda != nil {
+				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
+				memInfo.used = memInfo.total - memInfo.free
 			} else {
-				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free)
+				// shouldn't happen
+				slog.Warn("no valid cuda library loaded to refresh vram usage")
+				break
 			}
 			if memInfo.err != nil {
 				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
@@ -379,7 +421,21 @@ func GetGPUInfo() GpuInfoList {
 				slog.Warn("error looking up nvidia GPU memory")
 				continue
 			}
-			slog.Debug("updating cuda free memory", "gpu", gpu.ID, "name", gpu.Name, "before", format.HumanBytes2(gpu.FreeMemory), "now", format.HumanBytes2(uint64(memInfo.free)))
+			slog.Debug("updating cuda memory data",
+				"gpu", gpu.ID,
+				"name", gpu.Name,
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(gpu.TotalMemory),
+					"free", format.HumanBytes2(gpu.FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(uint64(memInfo.total)),
+					"free", format.HumanBytes2(uint64(memInfo.free)),
+					"used", format.HumanBytes2(uint64(memInfo.used)),
+				),
+			)
 			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
 		}
 
@@ -530,6 +586,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 	return 0, nil, ""
 }
 
+func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
+	var resp C.nvml_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvmlLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.nvml_init(lib, &resp)
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch, libPath
+		}
+	}
+	return nil, ""
+}
+
 func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 	var resp C.oneapi_init_resp_t
 	num_devices := 0
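
The discovery flow above caches nvmlLibPath once found and otherwise scans NvmlWindowsGlobs with FindGPULibs. A standalone sketch of that lookup step, assuming FindGPULibs amounts to glob matching over the candidate patterns (illustrative only, not the repo's implementation):

```go
package main

import (
	"fmt"
	"path/filepath"
)

var nvmlWindowsGlobs = []string{
	`c:\Windows\System32\nvml.dll`,
}

// findNVML resolves each candidate pattern to concrete file paths.
// Malformed patterns are skipped rather than treated as fatal.
func findNVML(patterns []string) []string {
	var found []string
	for _, pattern := range patterns {
		matches, err := filepath.Glob(pattern)
		if err != nil {
			continue
		}
		found = append(found, matches...)
	}
	return found
}

func main() {
	// On a Windows box with an NVIDIA driver this typically prints
	// [c:\Windows\System32\nvml.dll]; elsewhere it prints [].
	fmt.Println(findNVML(nvmlWindowsGlobs))
}
```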

+ 2 - 0
gpu/gpu_info.h

@@ -47,6 +47,7 @@ typedef struct mem_info {
   char gpu_name[GPU_NAME_LEN];
   uint64_t total;
   uint64_t free;
+  uint64_t used;
 
   // Compute Capability
   int major; 
@@ -62,6 +63,7 @@ void cpu_check_ram(mem_info_t *resp);
 
 #include "gpu_info_cudart.h"
 #include "gpu_info_nvcuda.h"
+#include "gpu_info_nvml.h"
 #include "gpu_info_oneapi.h"
 
 #endif  // __GPU_INFO_H__

+ 2 - 0
gpu/gpu_info_cudart.c

@@ -166,9 +166,11 @@ void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
 
   resp->total = memInfo.total;
   resp->free = memInfo.free;
+  resp->used = memInfo.used;
 
   LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
   LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
+  LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
   LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
 }
 

+ 3 - 3
gpu/gpu_info_nvcuda.c

@@ -197,12 +197,12 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
   }
 }
 
-void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free) {
+void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
   CUresult ret;
   CUcontext ctx = NULL;
   CUdevice device = -1;
   *free = 0;
-  uint64_t total = 0;
+  *total = 0;
 
   ret = (*h.cuDeviceGet)(&device, i);
   if (ret != CUDA_SUCCESS) {
@@ -218,7 +218,7 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free) {
     return;
   }
 
-  ret = (*h.cuMemGetInfo_v2)(free, &total);
+  ret = (*h.cuMemGetInfo_v2)(free, total);
   if (ret != CUDA_SUCCESS) {
     LOG(1, "nvcuda device memory info lookup failure %d", ret);
     // Best effort on failure...
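
Since cuMemGetInfo only reports free and total, the gpu.go hunk above derives used as total minus free. A tiny sketch of that derivation, adding an underflow guard the diff itself does not carry (free should never exceed total, but the check is cheap):

```go
package main

import "fmt"

// derivedUsed mirrors the memInfo.used = memInfo.total - memInfo.free
// fallback used for the nvcuda path, with a defensive underflow guard.
func derivedUsed(total, free uint64) uint64 {
	if free > total {
		return 0
	}
	return total - free
}

func main() {
	fmt.Println(derivedUsed(12<<30, 6<<30)) // 6442450944 (6 GiB used)
}
```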

+ 1 - 1
gpu/gpu_info_nvcuda.h

@@ -68,7 +68,7 @@ typedef struct nvcuda_init_resp {
 
 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
 void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
-void nvcuda_get_free(nvcuda_handle_t ch,  int device_id, uint64_t *free);
+void nvcuda_get_free(nvcuda_handle_t ch,  int device_id, uint64_t *free, uint64_t *total);
 void nvcuda_release(nvcuda_handle_t ch);
 
 #endif  // __GPU_INFO_NVCUDA_H__

+ 112 - 0
gpu/gpu_info_nvml.c

@@ -0,0 +1,112 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+
+#include "gpu_info_nvml.h"
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
+  nvmlReturn_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  LOG(1, "XXX starting nvml_init %s\n", nvml_lib_path);
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
+      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Nvidia GPUs: %s",
+             nvml_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!*(l[i].p)) {
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+    LOG(1, "XXX calling init_v2\n");
+
+  ret = (*resp->ch.nvmlInit_v2)();
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+      LOG(1, "XXX nvml_init done\n");
+
+}
+
+
+void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
+  nvmlDevice_t device;
+  nvmlMemory_t memInfo = {0};
+  nvmlReturn_t ret;
+  // Zero all outputs up front so callers never see stale values on error
+  *free = 0;
+  *total = 0;
+  *used = 0;
+
+  ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
+  if (ret != NVML_SUCCESS) {
+    LOG(1, "unable to get device handle %d: %d", device_id, ret);
+    return;
+  }
+
+  ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
+  if (ret != NVML_SUCCESS) {
+    LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
+    return;
+  }
+  *free = memInfo.free;
+  *total = memInfo.total;
+  *used = memInfo.used;
+}
+
+void nvml_release(nvml_handle_t h) {
+  LOG(h.verbose, "releasing nvml library\n");
+  nvmlReturn_t ret;
+  ret = (*h.nvmlShutdown)();
+  if (ret != NVML_SUCCESS) {
+    LOG(1, "error during nvmlShutdown %d", ret);
+  }
+  UNLOAD_LIBRARY(h.handle);
+}
+
+#endif  // __APPLE__
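
The shim above is ultimately LoadLibrary/GetProcAddress over four NVML entry points plus one struct read. For comparison, a Go-only sketch of the same sequence via syscall.NewLazyDLL; this is illustrative (the commit's real path goes through cgo) and assumes the struct layout matches nvmlMemory_st from gpu_info_nvml.h:

```go
//go:build windows

// A Go-only sketch of what gpu_info_nvml.c does: resolve four NVML
// entry points from nvml.dll and read per-device memory counters.
package main

import (
	"fmt"
	"syscall"
	"unsafe"
)

// nvmlMemory matches nvmlMemory_st: three unsigned long long counters.
type nvmlMemory struct {
	Total, Free, Used uint64
}

func main() {
	nvml := syscall.NewLazyDLL(`c:\Windows\System32\nvml.dll`)
	initV2 := nvml.NewProc("nvmlInit_v2")
	shutdown := nvml.NewProc("nvmlShutdown")
	getHandle := nvml.NewProc("nvmlDeviceGetHandleByIndex")
	getMemInfo := nvml.NewProc("nvmlDeviceGetMemoryInfo")

	if ret, _, _ := initV2.Call(); ret != 0 { // NVML_SUCCESS == 0
		fmt.Println("nvmlInit_v2 failed:", ret)
		return
	}
	defer shutdown.Call()

	var device uintptr // opaque handle, like the header's nvmlDevice_t
	if ret, _, _ := getHandle.Call(0, uintptr(unsafe.Pointer(&device))); ret != 0 {
		fmt.Println("device handle lookup failed:", ret)
		return
	}

	var mem nvmlMemory
	if ret, _, _ := getMemInfo.Call(device, uintptr(unsafe.Pointer(&mem))); ret != 0 {
		fmt.Println("memory info lookup failed:", ret)
		return
	}
	fmt.Printf("total=%d free=%d used=%d\n", mem.Total, mem.Free, mem.Used)
}
```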

+ 48 - 0
gpu/gpu_info_nvml.h

@@ -0,0 +1,48 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_NVML_H__
+#define __GPU_INFO_NVML_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum nvmlReturn_enum {
+  NVML_SUCCESS = 0,
+  // Other values omitted for now...
+} nvmlReturn_t;
+typedef void *nvmlDevice_t;  // Opaque is sufficient
+typedef struct nvmlMemory_st {
+  unsigned long long total;
+  unsigned long long free;
+  unsigned long long used;
+} nvmlMemory_t;
+
+typedef enum nvmlBrandType_enum
+{
+    NVML_BRAND_UNKNOWN          = 0,
+} nvmlBrandType_t;
+
+typedef struct nvml_handle {
+  void *handle;
+  uint16_t verbose;
+  nvmlReturn_t (*nvmlInit_v2)(void);
+  nvmlReturn_t (*nvmlShutdown)(void);
+  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+} nvml_handle_t;
+
+typedef struct nvml_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  nvml_handle_t ch;
+} nvml_init_resp_t;
+
+typedef struct nvml_compute_capability {
+  char *err;
+  int major;
+  int minor;
+} nvml_compute_capability_t;
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
+void nvml_get_free(nvml_handle_t ch,  int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
+void nvml_release(nvml_handle_t ch);
+
+#endif  // __GPU_INFO_NVML_H__
+#endif  // __APPLE__

+ 4 - 2
server/sched.go

@@ -487,8 +487,10 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	finished := make(chan interface{}, 1)
 
-	// CPU or Metal don't need checking, so no waiting required, windows can page VRAM, and the APIs we query tend to be optimistic on free space
-	if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || runtime.GOOS == "windows" {
+	// CPU or Metal don't need checking, so no waiting required.
+	// Windows can page VRAM, and currently only the CUDA path reports
+	// used VRAM accurately, so on Windows we only wait for cuda GPUs.
+	if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
+		(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
 		finished <- struct{}{}
 		return finished
 	}
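
For context, waitForVRAMRecovery polls GPU free memory after a model unloads and signals the scheduler once it climbs back. A simplified, hypothetical sketch of such a wait loop; refreshFreeMemory, the 250 ms tick, and the 5 s deadline are invented for illustration and do not reflect the repo's actual constants:

```go
package main

import (
	"fmt"
	"time"
)

// waitForRecovery polls the refreshed free-VRAM figure until it reaches
// target or a deadline passes, then signals on the returned channel.
func waitForRecovery(target uint64, refreshFreeMemory func() uint64) chan interface{} {
	finished := make(chan interface{}, 1)
	go func() {
		ticker := time.NewTicker(250 * time.Millisecond)
		defer ticker.Stop()
		deadline := time.After(5 * time.Second)
		for {
			select {
			case <-deadline:
				finished <- struct{}{} // give up waiting; proceed anyway
				return
			case <-ticker.C:
				if refreshFreeMemory() >= target {
					finished <- struct{}{}
					return
				}
			}
		}
	}()
	return finished
}

func main() {
	free := uint64(2 << 30)
	// Simulate VRAM being released 1 GiB at a time after an unload.
	<-waitForRecovery(4<<30, func() uint64 { free += 1 << 30; return free })
	fmt.Println("VRAM recovered")
}
```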