9 months ago · f6f759fc5f
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -274,6 +274,28 @@ func GetGPUInfo() GpuInfoList {
 
				 				gpuInfo.DriverMajor = driverMajor
			
 
				 				gpuInfo.DriverMinor = driverMinor
			
 
				 
			
 
				+				// query the management library as well so we can record any skew between the two
			
 
				+				// which represents overhead on the GPU we must set aside on subsequent updates
			
 
				+				if cHandles.nvml != nil {
			
 
				+					C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
			
 
				+					if memInfo.err != nil {
			
 
				+						slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
			
 
				+						C.free(unsafe.Pointer(memInfo.err))
			
 
				+					} else {
			
 
				+						if memInfo.free != 0 && uint64(memInfo.free) > gpuInfo.FreeMemory {
			
 
				+							gpuInfo.OSOverhead = uint64(memInfo.free) - gpuInfo.FreeMemory
			
 
				+							slog.Info("detected OS VRAM overhead",
			
 
				+								"id", gpuInfo.ID,
			
 
				+								"library", gpuInfo.Library,
			
 
				+								"compute", gpuInfo.Compute,
			
 
				+								"driver", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor),
			
 
				+								"name", gpuInfo.Name,
			
 
				+								"overhead", format.HumanBytes2(gpuInfo.OSOverhead),
			
 
				+							)
			
 
				+						}
			
 
				+					}
			
 
				+				}
			
 
				+
			
 
				 				// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
			
 
				 				cudaGPUs = append(cudaGPUs, gpuInfo)
			
 
				 			}
			
@@ -374,9 +396,14 @@ func GetGPUInfo() GpuInfoList {
 
				 				slog.Warn("error looking up nvidia GPU memory")
			
 
				 				continue
			
 
				 			}
			
 
				+			if cHandles.nvml != nil && gpu.OSOverhead > 0 {
			
 
				+				// When using the management library update based on recorded overhead
			
 
				+				memInfo.free -= C.uint64_t(gpu.OSOverhead)
			
 
				+			}
			
 
				 			slog.Debug("updating cuda memory data",
			
 
				 				"gpu", gpu.ID,
			
 
				 				"name", gpu.Name,
			
 
				+				"overhead", format.HumanBytes2(gpu.OSOverhead),
			
 
				 				slog.Group(
			
 
				 					"before",
			
 
				 					"total", format.HumanBytes2(gpu.TotalMemory),
			
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -52,7 +52,8 @@ type CPUInfo struct {
 
				 
			
 
				 type CudaGPUInfo struct {
			
 
				 	GpuInfo
			
 
				-	index int //nolint:unused,nolintlint
			
 
				+	OSOverhead uint64 // Memory overhead between the driver library and management library
			
 
				+	index      int    //nolint:unused,nolintlint
			
 
				 }
			
 
				 type CudaGPUInfoList []CudaGPUInfo