
Support forced spreading for multi GPU

Our default behavior today is to try to fit into a single GPU if possible.
Some users would prefer the old behavior of always spreading across
multiple GPUs even if the model can fit into one. This change exposes
that behavior as a tunable.
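
As a usage sketch (hypothetical snippet, not part of this commit; it
assumes the github.com/ollama/ollama module path), the tunable is read
from the environment when configuration is loaded:

package main

import (
	"fmt"
	"os"

	"github.com/ollama/ollama/envconfig"
)

func main() {
	// Hypothetical illustration: force the old spread-across-all-GPUs
	// behavior by setting the variable before config is loaded.
	os.Setenv("OLLAMA_SCHED_SPREAD", "1")
	envconfig.LoadConfig()
	fmt.Println(envconfig.SchedSpread) // true
}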
Daniel Hiltgen, 11 months ago
parent
commit 5e8ff556cb
2 changed files with 18 additions and 4 deletions
  1. envconfig/config.go (+12, -0)
  2. server/sched.go (+6, -4)

envconfig/config.go (+12, -0)

@@ -53,6 +53,8 @@ var (
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
+	// Set via OLLAMA_SCHED_SPREAD in the environment
+	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
 )
@@ -79,6 +81,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
+		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
 	}
 }
 }
@@ -191,6 +194,15 @@ func LoadConfig() {
 		NoHistory = true
 	}

+	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
+		s, err := strconv.ParseBool(spread)
+		if err == nil {
+			SchedSpread = s
+		} else {
+			SchedSpread = true
+		}
+	}
+
 	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
 		NoPrune = true
 	}
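
The parse above has a wrinkle worth noting: a value that
strconv.ParseBool understands (1/t/T/TRUE/true/True and
0/f/F/FALSE/false/False) is honored, while any other non-empty value
also enables spreading. A minimal standalone sketch (hypothetical, not
part of this commit) mirroring that logic:

package main

import (
	"fmt"
	"strconv"
)

// schedSpread mirrors the OLLAMA_SCHED_SPREAD handling in LoadConfig:
// empty keeps the single-GPU-first default, a parseable boolean is
// honored, and any other non-empty value forces spreading.
func schedSpread(raw string) bool {
	if raw == "" {
		return false
	}
	s, err := strconv.ParseBool(raw)
	if err != nil {
		return true
	}
	return s
}

func main() {
	for _, v := range []string{"", "0", "false", "1", "true", "yes"} {
		fmt.Printf("OLLAMA_SCHED_SPREAD=%q -> spread=%v\n", v, schedSpread(v))
	}
}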

server/sched.go (+6, -4)

@@ -558,10 +558,12 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))

 		// First attempt to fit the model into a single GPU
-		for _, g := range sgl {
-			if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-				slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
-				return []gpu.GpuInfo{g}
+		if !envconfig.SchedSpread {
+			for _, g := range sgl {
+				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+					return []gpu.GpuInfo{g}
+				}
 			}
 		}
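
Net effect: the single-GPU fast path now runs only when
envconfig.SchedSpread is false; when the flag is set, or when no single
GPU fits, control falls through to the multi-GPU placement below (not
shown in this hunk). A simplified sketch of that decision order
(hypothetical helper, not the actual pickBestFitGPUs):

package main

import "fmt"

// pickGPUs condenses the control flow above: the single-GPU fast path
// runs only when spreading is off; otherwise all GPUs are used.
// fitsSingle stands in for the llm.PredictServerFit probe.
func pickGPUs(spread bool, fitsSingle func() bool, single, all []string) []string {
	if !spread && fitsSingle() {
		return single // default: prefer one GPU when the model fits
	}
	return all // spread forced, or no single-GPU fit: use every GPU
}

func main() {
	single, all := []string{"GPU-0"}, []string{"GPU-0", "GPU-1"}
	fits := func() bool { return true }
	fmt.Println(pickGPUs(false, fits, single, all)) // [GPU-0]
	fmt.Println(pickGPUs(true, fits, single, all))  // [GPU-0 GPU-1]
}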