11 months ago · 5e8ff556cb
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -53,6 +53,8 @@ var (
 
				 	NumParallel int
			
 
				 	// Set via OLLAMA_RUNNERS_DIR in the environment
			
 
				 	RunnersDir string
			
 
				+	// Set via OLLAMA_SCHED_SPREAD in the environment
			
 
				+	SchedSpread bool
			
 
				 	// Set via OLLAMA_TMPDIR in the environment
			
 
				 	TmpDir string
			
 
				 )
			
@@ -79,6 +81,7 @@ func AsMap() map[string]EnvVar {
 
				 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
			
 
				 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
			
 
				 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
			
 
				+		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
			
 
				 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
			
 
				 	}
			
 
				 }
			
@@ -191,6 +194,15 @@ func LoadConfig() {
 
				 		NoHistory = true
			
 
				 	}
			
 
				 
			
 
				+	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
			
 
				+		s, err := strconv.ParseBool(spread)
			
 
				+		if err == nil {
			
 
				+			SchedSpread = s
			
 
				+		} else {
			
 
				+			SchedSpread = true
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
			
 
				 		NoPrune = true
			
 
				 	}
			
--- a/server/sched.go
+++ b/server/sched.go
@@ -558,10 +558,12 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 
				 		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
			
 
				 
			
 
				 		// First attempt to fit the model into a single GPU
			
 
				-		for _, g := range sgl {
			
 
				-			if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
			
 
				-				slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
			
 
				-				return []gpu.GpuInfo{g}
			
 
				+		if !envconfig.SchedSpread {
			
 
				+			for _, g := range sgl {
			
 
				+				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
			
 
				+					slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
			
 
				+					return []gpu.GpuInfo{g}
			
 
				+				}
			
 
				 			}
			
 
				 		}