|
@@ -274,6 +274,28 @@ func GetGPUInfo() GpuInfoList {
|
|
gpuInfo.DriverMajor = driverMajor
|
|
gpuInfo.DriverMajor = driverMajor
|
|
gpuInfo.DriverMinor = driverMinor
|
|
gpuInfo.DriverMinor = driverMinor
|
|
|
|
|
|
|
|
+ // query the management library as well so we can record any skew between the two
|
|
|
|
+ // which represents overhead on the GPU we must set aside on subsequent updates
|
|
|
|
+ if cHandles.nvml != nil {
|
|
|
|
+ C.nvml_get_free(*cHandles.nvml, C.int(gpuInfo.index), &memInfo.free, &memInfo.total, &memInfo.used)
|
|
|
|
+ if memInfo.err != nil {
|
|
|
|
+ slog.Warn("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
|
|
|
|
+ C.free(unsafe.Pointer(memInfo.err))
|
|
|
|
+ } else {
|
|
|
|
+ if memInfo.free != 0 && uint64(memInfo.free) > gpuInfo.FreeMemory {
|
|
|
|
+ gpuInfo.OSOverhead = uint64(memInfo.free) - gpuInfo.FreeMemory
|
|
|
|
+ slog.Info("detected OS VRAM overhead",
|
|
|
|
+ "id", gpuInfo.ID,
|
|
|
|
+ "library", gpuInfo.Library,
|
|
|
|
+ "compute", gpuInfo.Compute,
|
|
|
|
+ "driver", fmt.Sprintf("%d.%d", gpuInfo.DriverMajor, gpuInfo.DriverMinor),
|
|
|
|
+ "name", gpuInfo.Name,
|
|
|
|
+ "overhead", format.HumanBytes2(gpuInfo.OSOverhead),
|
|
|
|
+ )
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
|
|
// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
|
|
cudaGPUs = append(cudaGPUs, gpuInfo)
|
|
cudaGPUs = append(cudaGPUs, gpuInfo)
|
|
}
|
|
}
|
|
@@ -374,9 +396,14 @@ func GetGPUInfo() GpuInfoList {
|
|
slog.Warn("error looking up nvidia GPU memory")
|
|
slog.Warn("error looking up nvidia GPU memory")
|
|
continue
|
|
continue
|
|
}
|
|
}
|
|
|
|
+ if cHandles.nvml != nil && gpu.OSOverhead > 0 {
|
|
|
|
+ // When using the management library, subtract the recorded OS overhead from the free memory value
|
|
|
|
+ memInfo.free -= C.uint64_t(gpu.OSOverhead)
|
|
|
|
+ }
|
|
slog.Debug("updating cuda memory data",
|
|
slog.Debug("updating cuda memory data",
|
|
"gpu", gpu.ID,
|
|
"gpu", gpu.ID,
|
|
"name", gpu.Name,
|
|
"name", gpu.Name,
|
|
|
|
+ "overhead", format.HumanBytes2(gpu.OSOverhead),
|
|
slog.Group(
|
|
slog.Group(
|
|
"before",
|
|
"before",
|
|
"total", format.HumanBytes2(gpu.TotalMemory),
|
|
"total", format.HumanBytes2(gpu.TotalMemory),
|