Reintroduce nvidia nvml library for windows

This library will give us the most reliable free VRAM reporting on Windows
to enable concurrent model scheduling.
Daniel Hiltgen, 11 months ago
Parent
Commit
434dfe30c5
8 changed files with 248 additions and 9 deletions
  1. gpu/gpu.go (+76 -3)
  2. gpu/gpu_info.h (+2 -0)
  3. gpu/gpu_info_cudart.c (+2 -0)
  4. gpu/gpu_info_nvcuda.c (+3 -3)
  5. gpu/gpu_info_nvcuda.h (+1 -1)
  6. gpu/gpu_info_nvml.c (+112 -0)
  7. gpu/gpu_info_nvml.h (+48 -0)
  8. server/sched.go (+4 -2)

gpu/gpu.go (+76 -3)

@@ -28,6 +28,7 @@ type cudaHandles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
 	nvcuda      *C.nvcuda_handle_t
+	nvml        *C.nvml_handle_t
 }
 
 type oneapiHandles struct {
@@ -50,6 +51,7 @@ var (
 	nvcudaLibPath string
 	cudartLibPath string
 	oneapiLibPath string
+	nvmlLibPath   string
 	rocmGPUs      []RocmGPUInfo
 	oneapiGPUs    []OneapiGPUInfo
 )
@@ -81,6 +83,10 @@ var CudartWindowsGlobs = []string{
 	"c:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*\\bin\\cudart64_*.dll",
 }
 
+var NvmlWindowsGlobs = []string{
+	"c:\\Windows\\System32\\nvml.dll",
+}
+
 var NvcudaLinuxGlobs = []string{
 	"/usr/local/cuda*/targets/*/lib/libcuda.so*",
 	"/usr/lib/*-linux-gnu/nvidia/current/libcuda.so*",
@@ -117,6 +123,10 @@ func initCudaHandles() *cudaHandles {
 
 	cHandles := &cudaHandles{}
 	// Short Circuit if we already know which library to use
+	if nvmlLibPath != "" {
+		cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
+		return cHandles
+	}
 	if nvcudaLibPath != "" {
 		cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
 		return cHandles
@@ -131,6 +141,8 @@ func initCudaHandles() *cudaHandles {
 	var cudartMgmtPatterns []string
 	var nvcudaMgmtName string
 	var nvcudaMgmtPatterns []string
+	var nvmlMgmtName string
+	var nvmlMgmtPatterns []string
 
 	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
@@ -142,6 +154,12 @@ func initCudaHandles() *cudaHandles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "nvcuda.dll"
 		nvcudaMgmtPatterns = NvcudaWindowsGlobs
+
+		// Use nvml to refresh free memory on windows only
+		nvmlMgmtName = "nvml.dll"
+		nvmlMgmtPatterns = make([]string, len(NvmlWindowsGlobs))
+		copy(nvmlMgmtPatterns, NvmlWindowsGlobs)
+
 	case "linux":
 		cudartMgmtName = "libcudart.so*"
 		if tmpDir != "" {
@@ -152,10 +170,24 @@ func initCudaHandles() *cudaHandles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "libcuda.so*"
 		nvcudaMgmtPatterns = NvcudaLinuxGlobs
+
+		// nvml omitted on linux
 	default:
 		return cHandles
 	}
 
+	if len(nvmlMgmtPatterns) > 0 {
+		nvmlLibPaths := FindGPULibs(nvmlMgmtName, nvmlMgmtPatterns)
+		if len(nvmlLibPaths) > 0 {
+			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
+			if nvml != nil {
+				slog.Debug("nvidia-ml loaded", "library", libPath)
+				cHandles.nvml = nvml
+				nvmlLibPath = libPath
+			}
+		}
+	}
+
 	nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
 		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
@@ -230,6 +262,9 @@ func GetGPUInfo() GpuInfoList {
 			if cHandles.nvcuda != nil {
 				C.nvcuda_release(*cHandles.nvcuda)
 			}
+			if cHandles.nvml != nil {
+				C.nvml_release(*cHandles.nvml)
+			}
 		}
 		if oHandles != nil {
 			if oHandles.oneapi != nil {
@@ -365,10 +400,17 @@ func GetGPUInfo() GpuInfoList {
 			cHandles = initCudaHandles()
 		}
 		for i, gpu := range cudaGPUs {
-			if cHandles.cudart != nil {
+			if cHandles.nvml != nil {
+				C.nvml_get_free(*cHandles.nvml, C.int(gpu.index), &memInfo.free, &memInfo.total, &memInfo.used)
+			} else if cHandles.cudart != nil {
 				C.cudart_bootstrap(*cHandles.cudart, C.int(gpu.index), &memInfo)
+			} else if cHandles.nvcuda != nil {
+				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free, &memInfo.total)
+				memInfo.used = memInfo.total - memInfo.free
 			} else {
-				C.nvcuda_get_free(*cHandles.nvcuda, C.int(gpu.index), &memInfo.free)
+				// shouldn't happen
+				slog.Warn("no valid cuda library loaded to refresh vram usage")
+				break
 			}
 			if memInfo.err != nil {
 				slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
@@ -379,7 +421,21 @@ func GetGPUInfo() GpuInfoList {
 				slog.Warn("error looking up nvidia GPU memory")
 				continue
 			}
-			slog.Debug("updating cuda free memory", "gpu", gpu.ID, "name", gpu.Name, "before", format.HumanBytes2(gpu.FreeMemory), "now", format.HumanBytes2(uint64(memInfo.free)))
+			slog.Debug("updating cuda memory data",
+				"gpu", gpu.ID,
+				"name", gpu.Name,
+				slog.Group(
+					"before",
+					"total", format.HumanBytes2(gpu.TotalMemory),
+					"free", format.HumanBytes2(gpu.FreeMemory),
+				),
+				slog.Group(
+					"now",
+					"total", format.HumanBytes2(uint64(memInfo.total)),
+					"free", format.HumanBytes2(uint64(memInfo.free)),
+					"used", format.HumanBytes2(uint64(memInfo.used)),
+				),
+			)
 			cudaGPUs[i].FreeMemory = uint64(memInfo.free)
 		}
 
@@ -530,6 +586,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 	return 0, nil, ""
 }
 
+func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
+	var resp C.nvml_init_resp_t
+	resp.ch.verbose = getVerboseState()
+	for _, libPath := range nvmlLibPaths {
+		lib := C.CString(libPath)
+		defer C.free(unsafe.Pointer(lib))
+		C.nvml_init(lib, &resp)
+		if resp.err != nil {
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
+			C.free(unsafe.Pointer(resp.err))
+		} else {
+			return &resp.ch, libPath
+		}
+	}
+	return nil, ""
+}
+
 func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 	var resp C.oneapi_init_resp_t
 	num_devices := 0

gpu/gpu_info.h (+2 -0)

@@ -47,6 +47,7 @@ typedef struct mem_info {
   char gpu_name[GPU_NAME_LEN];
   uint64_t total;
   uint64_t free;
+  uint64_t used;
 
   // Compute Capability
   int major; 
@@ -62,6 +63,7 @@ void cpu_check_ram(mem_info_t *resp);
 
 #include "gpu_info_cudart.h"
 #include "gpu_info_nvcuda.h"
+#include "gpu_info_nvml.h"
 #include "gpu_info_oneapi.h"
 
 #endif  // __GPU_INFO_H__

gpu/gpu_info_cudart.c (+2 -0)

@@ -166,9 +166,11 @@ void cudart_bootstrap(cudart_handle_t h, int i, mem_info_t *resp) {
 
   resp->total = memInfo.total;
   resp->free = memInfo.free;
+  resp->used = memInfo.used;
 
   LOG(h.verbose, "[%s] CUDA totalMem %lu\n", resp->gpu_id, resp->total);
   LOG(h.verbose, "[%s] CUDA freeMem %lu\n", resp->gpu_id, resp->free);
+  LOG(h.verbose, "[%s] CUDA usedMem %lu\n", resp->gpu_id, resp->used);
   LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);
 }
 

gpu/gpu_info_nvcuda.c (+3 -3)

@@ -197,12 +197,12 @@ void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
   }
 }
 
-void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free) {
+void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
   CUresult ret;
   CUcontext ctx = NULL;
   CUdevice device = -1;
   *free = 0;
-  uint64_t total = 0;
+  *total = 0;
 
   ret = (*h.cuDeviceGet)(&device, i);
   if (ret != CUDA_SUCCESS) {
@@ -218,7 +218,7 @@ void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free) {
     return;
   }
 
-  ret = (*h.cuMemGetInfo_v2)(free, &total);
+  ret = (*h.cuMemGetInfo_v2)(free, total);
   if (ret != CUDA_SUCCESS) {
     LOG(1, "nvcuda device memory info lookup failure %d", ret);
     // Best effort on failure...

gpu/gpu_info_nvcuda.h (+1 -1)

@@ -68,7 +68,7 @@ typedef struct nvcuda_init_resp {
 
 void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp);
 void nvcuda_bootstrap(nvcuda_handle_t ch, int device_id, mem_info_t *resp);
-void nvcuda_get_free(nvcuda_handle_t ch,  int device_id, uint64_t *free);
+void nvcuda_get_free(nvcuda_handle_t ch,  int device_id, uint64_t *free, uint64_t *total);
 void nvcuda_release(nvcuda_handle_t ch);
 
 #endif  // __GPU_INFO_NVCUDA_H__

gpu/gpu_info_nvml.c (+112 -0)

@@ -0,0 +1,112 @@
+#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?
+
+#include <string.h>
+
+#include "gpu_info_nvml.h"
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp) {
+  nvmlReturn_t ret;
+  resp->err = NULL;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+
+  LOG(1, "XXX starting nvml_init %s\n", nvml_lib_path);
+
+  struct lookup {
+    char *s;
+    void **p;
+  } l[] = {
+      {"nvmlInit_v2", (void *)&resp->ch.nvmlInit_v2},
+      {"nvmlShutdown", (void *)&resp->ch.nvmlShutdown},
+      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.nvmlDeviceGetHandleByIndex},
+      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.nvmlDeviceGetMemoryInfo},
+      {NULL, NULL},
+  };
+
+  resp->ch.handle = LOAD_LIBRARY(nvml_lib_path, RTLD_LAZY);
+  if (!resp->ch.handle) {
+    char *msg = LOAD_ERR();
+    LOG(resp->ch.verbose, "library %s load err: %s\n", nvml_lib_path, msg);
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Nvidia GPUs: %s",
+             nvml_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+//   LOG(resp->ch.verbose, "wiring nvidia management library functions in %s\n", nvml_lib_path);
+  
+    LOG(1, "XXX wiring functions nvml_init\n");
+
+  for (i = 0; l[i].s != NULL; i++) {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->ch.verbose, "dlsym: %s\n", l[i].s);
+
+    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
+    if (!*l[i].p) {
+      char *msg = LOAD_ERR();
+      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->ch.handle);
+      resp->ch.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
+               msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+    LOG(1, "XXX calling init_v2\n");
+
+  ret = (*resp->ch.nvmlInit_v2)();
+  if (ret != NVML_SUCCESS) {
+    LOG(resp->ch.verbose, "nvmlInit_v2 err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->ch.handle);
+    resp->ch.handle = NULL;
+    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+      LOG(1, "XXX nvml_init done\n");
+
+}
+
+
+void nvml_get_free(nvml_handle_t h, int device_id, uint64_t *free, uint64_t *total, uint64_t *used) {
+    nvmlDevice_t device;
+    nvmlMemory_t memInfo = {0};
+    nvmlReturn_t ret;
+    LOG(1, "XXX in nvml_get_free\n");
+    ret = (*h.nvmlDeviceGetHandleByIndex)(device_id, &device);
+    if (ret != NVML_SUCCESS) {
+        LOG(1, "unable to get device handle %d: %d", device_id, ret);
+        *free = 0;
+        return;
+    }
+
+    ret = (*h.nvmlDeviceGetMemoryInfo)(device, &memInfo);
+    if (ret != NVML_SUCCESS) {
+        LOG(1, "device memory info lookup failure %d: %d", device_id, ret);
+        *free = 0;
+        return;
+    }
+    *free = memInfo.free;
+    *total = memInfo.total;
+    *used = memInfo.used;
+}
+
+
+void nvml_release(nvml_handle_t h) {
+  LOG(h.verbose, "releasing nvml library\n");
+  nvmlReturn_t ret;
+  ret = (*h.nvmlShutdown)();
+  if (ret != NVML_SUCCESS) {
+    LOG(1, "error during nvmlShutdown %d", ret);
+  }
+  UNLOAD_LIBRARY(h.handle);
+  h.handle = NULL;
+}
+
+#endif  // __APPLE__

gpu/gpu_info_nvml.h (+48 -0)

@@ -0,0 +1,48 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_NVML_H__
+#define __GPU_INFO_NVML_H__
+#include "gpu_info.h"
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum nvmlReturn_enum {
+  NVML_SUCCESS = 0,
+  // Other values omitted for now...
+} nvmlReturn_t;
+typedef void *nvmlDevice_t;  // Opaque is sufficient
+typedef struct nvmlMemory_st {
+  unsigned long long total;
+  unsigned long long free;
+  unsigned long long used;
+} nvmlMemory_t;
+
+typedef enum nvmlBrandType_enum
+{
+    NVML_BRAND_UNKNOWN          = 0,
+} nvmlBrandType_t;
+
+typedef struct nvml_handle {
+  void *handle;
+  uint16_t verbose;
+  nvmlReturn_t (*nvmlInit_v2)(void);
+  nvmlReturn_t (*nvmlShutdown)(void);
+  nvmlReturn_t (*nvmlDeviceGetHandleByIndex)(unsigned int, nvmlDevice_t *);
+  nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t *);
+} nvml_handle_t;
+
+typedef struct nvml_init_resp {
+  char *err;  // If err is non-null handle is invalid
+  nvml_handle_t ch;
+} nvml_init_resp_t;
+
+typedef struct nvml_compute_capability {
+  char *err;
+  int major;
+  int minor;
+} nvml_compute_capability_t;
+
+void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
+void nvml_get_free(nvml_handle_t ch,  int device_id, uint64_t *free, uint64_t *total, uint64_t *used);
+void nvml_release(nvml_handle_t ch);
+
+#endif  // __GPU_INFO_NVML_H__
+#endif  // __APPLE__
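
For reference, a minimal usage sketch (not part of the commit) exercising the helpers declared in gpu_info_nvml.h above. It assumes compilation inside the gpu/ directory so that gpu_info.h supplies the LOAD_LIBRARY/LOAD_SYMBOL plumbing, and that nvml.dll sits in its default System32 location on Windows:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#include "gpu_info_nvml.h"

int main(void) {
  nvml_init_resp_t resp = {0};
  resp.ch.verbose = 1;  // log every step while experimenting

  // Default Windows location, matching NvmlWindowsGlobs in gpu/gpu.go
  char lib_path[] = "c:\\Windows\\System32\\nvml.dll";
  nvml_init(lib_path, &resp);
  if (resp.err != NULL) {
    fprintf(stderr, "nvml_init failed: %s\n", resp.err);
    free(resp.err);
    return 1;
  }

  // Query device 0; on failure nvml_get_free logs and reports free == 0
  uint64_t free_mem = 0, total_mem = 0, used_mem = 0;
  nvml_get_free(resp.ch, 0, &free_mem, &total_mem, &used_mem);
  printf("GPU 0: total=%llu free=%llu used=%llu bytes\n",
         (unsigned long long)total_mem,
         (unsigned long long)free_mem,
         (unsigned long long)used_mem);

  nvml_release(resp.ch);
  return 0;
}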

server/sched.go (+4 -2)

@@ -487,8 +487,10 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	finished := make(chan interface{}, 1)
 
-	// CPU or Metal don't need checking, so no waiting required, windows can page VRAM, and the APIs we query tend to be optimistic on free space
-	if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) || runtime.GOOS == "windows" {
+	// CPU or Metal don't need checking, so no waiting required
+	// windows can page VRAM, only cuda currently can report accurate used vram usage
+	if (len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal")) ||
+		(runtime.GOOS == "windows" && runner.gpus[0].Library != "cuda") {
 		finished <- struct{}{}
 		return finished
 	}