|
@@ -558,10 +558,12 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
|
|
|
sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
|
|
|
|
|
|
// First attempt to fit the model into a single GPU
|
|
|
- for _, g := range sgl {
|
|
|
- if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
|
|
|
- slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
|
|
|
- return []gpu.GpuInfo{g}
|
|
|
+ if !envconfig.SchedSpread {
|
|
|
+ for _, g := range sgl {
|
|
|
+ if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
|
|
|
+ slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
|
|
|
+ return []gpu.GpuInfo{g}
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|