|
@@ -81,7 +81,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|
|
var err error
|
|
|
var cpuRunner string
|
|
|
var estimate MemoryEstimate
|
|
|
- var systemMemory uint64
|
|
|
+ var systemTotalMemory uint64
|
|
|
+ var systemFreeMemory uint64
|
|
|
+
|
|
|
+ systemMemInfo, err := gpu.GetCPUMem()
|
|
|
+ if err != nil {
|
|
|
+ slog.Error("failed to lookup system memory", "error", err)
|
|
|
+ } else {
|
|
|
+ systemTotalMemory = systemMemInfo.TotalMemory
|
|
|
+ systemFreeMemory = systemMemInfo.FreeMemory
|
|
|
+ slog.Debug("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", systemFreeMemory)
|
|
|
+ }
|
|
|
|
|
|
// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
|
|
|
if opts.NumGPU == 0 {
|
|
@@ -91,19 +101,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|
|
cpuRunner = serverForCpu()
|
|
|
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
|
|
} else {
|
|
|
- if gpus[0].Library == "metal" {
|
|
|
- memInfo, err := gpu.GetCPUMem()
|
|
|
- if err != nil {
|
|
|
- slog.Error("failed to lookup system memory", "error", err)
|
|
|
- } else {
|
|
|
- systemMemory = memInfo.TotalMemory
|
|
|
- slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
|
|
|
- }
|
|
|
- }
|
|
|
estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
|
|
|
|
|
|
switch {
|
|
|
- case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
|
|
|
+ case gpus[0].Library == "metal" && estimate.VRAMSize > systemTotalMemory:
|
|
|
// disable partial offloading when model is greater than total system memory as this
|
|
|
// can lead to locking up the system
|
|
|
opts.NumGPU = 0
|
|
@@ -211,7 +212,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|
|
}
|
|
|
|
|
|
// Windows CUDA should not use mmap for best performance
|
|
|
- if (runtime.GOOS == "windows" && gpus[0].Library == "cuda") || opts.UseMMap == api.TriStateFalse {
|
|
|
+ // On Linux, using mmap with a model larger than free system memory leads to thrashing
|
|
|
+ if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
|
|
|
+ (runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
|
|
|
+ opts.UseMMap == api.TriStateFalse {
|
|
|
params = append(params, "--no-mmap")
|
|
|
}
|
|
|
|