Release gpu discovery library after use

Leaving the cudart library loaded kept ~30 MB of memory
pinned on the GPU in the main process. This change ensures
we don't hold GPU resources when idle (the new handle
lifetime is sketched below the changed-file list).
Daniel Hiltgen, commit 526d4eb204
5 changed files with 31 additions and 10 deletions
  1. gpu/gpu.go (+16, -10)
  2. gpu/gpu_info_cudart.c (+6, -0)
  3. gpu/gpu_info_cudart.h (+1, -0)
  4. gpu/gpu_info_nvml.c (+7, -0)
  5. gpu/gpu_info_nvml.h (+1, -0)
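
Before this commit, initGPUHandles() stashed the loaded libraries in the
package-level gpuHandles and never released them, so the cudart handle
(and its GPU allocation) stayed pinned for the life of the process. Now
the function returns the handles to its caller, and GetGPUInfo releases
them in a deferred closure. A minimal sketch of the new lifetime, with
hypothetical stand-in types in place of the cgo handles:

package main

import "fmt"

// Hypothetical stand-ins for the cgo-backed *C.nvml_handle_t and
// *C.cudart_handle_t used by gpu/gpu.go.
type nvmlHandle struct{}
type cudartHandle struct{}

type handles struct {
	nvml   *nvmlHandle
	cudart *cudartHandle
}

// initGPUHandles now returns fresh handles instead of caching them
// in a package-level variable.
func initGPUHandles() *handles {
	// Library probing elided; pretend cudart was found.
	return &handles{cudart: &cudartHandle{}}
}

func GetGPUInfo() {
	gpuHandles := initGPUHandles()
	defer func() {
		// Mirrors the C.nvml_release / C.cudart_release calls in
		// the diff; runs on every return path out of GetGPUInfo.
		if gpuHandles.nvml != nil {
			fmt.Println("nvml released")
		}
		if gpuHandles.cudart != nil {
			fmt.Println("cudart released")
		}
	}()
	// ... VRAM and compute-capability probing would happen here ...
}

func main() { GetGPUInfo() }

Because the handles are now function-local, idle periods no longer keep
cudart resident in the main process.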

gpu/gpu.go (+16, -10)

@@ -35,7 +35,6 @@ const (
 )
 
 var gpuMutex sync.Mutex
-var gpuHandles *handles = nil
 
 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
@@ -85,11 +84,11 @@ var CudartWindowsGlobs = []string{
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
 
 // Note: gpuMutex must already be held
-func initGPUHandles() {
+func initGPUHandles() *handles {
 
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
 
-	gpuHandles = &handles{nil, nil}
+	gpuHandles := &handles{nil, nil}
 	var nvmlMgmtName string
 	var nvmlMgmtPatterns []string
 	var cudartMgmtName string
@@ -116,7 +115,7 @@ func initGPUHandles() {
 		}
 		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
 	default:
-		return
+		return gpuHandles
 	}
 
 	slog.Info("Detecting GPU type")
@@ -126,7 +125,7 @@ func initGPUHandles() {
 		if cudart != nil {
 			slog.Info("Nvidia GPU detected via cudart")
 			gpuHandles.cudart = cudart
-			return
+			return gpuHandles
 		}
 	}
 
@@ -137,10 +136,10 @@ func initGPUHandles() {
 		if nvml != nil {
 			slog.Info("Nvidia GPU detected via nvidia-ml")
 			gpuHandles.nvml = nvml
-			return
+			return gpuHandles
 		}
 	}
-
+	return gpuHandles
 }
 
 func GetGPUInfo() GpuInfo {
@@ -148,9 +147,16 @@ func GetGPUInfo() GpuInfo {
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
-	if gpuHandles == nil {
-		initGPUHandles()
-	}
+
+	gpuHandles := initGPUHandles()
+	defer func() {
+		if gpuHandles.nvml != nil {
+			C.nvml_release(*gpuHandles.nvml)
+		}
+		if gpuHandles.cudart != nil {
+			C.cudart_release(*gpuHandles.cudart)
+		}
+	}()
 
 	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
 	cpuVariant := GetCPUVariant()
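
The deferred closure above fires on every return path out of GetGPUInfo,
so even early exits later in the function unload the libraries. Both nil
checks are needed: depending on which library was found, initGPUHandles
returns with neither handle populated, or exactly one of the two.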

gpu/gpu_info_cudart.c (+6, -0)

@@ -191,4 +191,10 @@ void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *r
   }
 }
 
+void cudart_release(cudart_handle_t h) {
+  LOG(h.verbose, "releasing cudart library\n");
+  UNLOAD_LIBRARY(h.handle);
+  h.handle = NULL;
+}
+
 #endif  // __APPLE__
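
Two details in cudart_release, which also apply to the identical
nvml_release below: UNLOAD_LIBRARY is presumably the project's
portability macro over dlclose/FreeLibrary, and cudart_handle_t is
passed by value, so the h.handle = NULL assignment only clears the
callee's local copy; the caller's struct is untouched, and the real
teardown is the unload itself.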

gpu/gpu_info_cudart.h (+1, -0)

@@ -55,6 +55,7 @@ typedef struct cudart_compute_capability {
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
 void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
 void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
+void cudart_release(cudart_handle_t ch);
 
 #endif  // __GPU_INFO_CUDART_H__
 #endif  // __APPLE__

gpu/gpu_info_nvml.c (+7, -0)

@@ -211,4 +211,11 @@ void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
     }
   }
 }
+
+void nvml_release(nvml_handle_t h) {
+  LOG(h.verbose, "releasing nvml library\n");
+  UNLOAD_LIBRARY(h.handle);
+  h.handle = NULL;
+}
+
 #endif  // __APPLE__

gpu/gpu_info_nvml.h (+1, -0)

@@ -51,6 +51,7 @@ typedef struct nvml_compute_capability {
 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
 void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
 void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
+void nvml_release(nvml_handle_t ch);
 
 #endif  // __GPU_INFO_NVML_H__
 #endif  // __APPLE__