
Disable concurrency for AMD + Windows

Until ROCm v6.2 ships, we won't be able to get accurate free memory
reporting on Windows, which makes automatic concurrency too risky.
Users can still opt in, but they will need to pay attention to model
sizes; otherwise they may thrash/page VRAM or cause OOM crashes.
All other platforms and GPUs have accurate VRAM reporting wired
up now, so we can turn on concurrency by default.
Daniel Hiltgen, 10 months ago
Commit 9929751cc8
4 files changed, 44 additions, 10 deletions
  1. envconfig/config.go (+4 -4)
  2. gpu/amd_windows.go (+3 -2)
  3. gpu/types.go (+5 -0)
  4. server/sched.go (+32 -4)
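
As a quick illustration of the rule described in the commit message, here is a minimal standalone sketch (not the actual implementation; gpuInfo and autoMaxRunners are simplified stand-ins for gpu.GpuInfo and the logic added to server/sched.go in this commit). If any GPU cannot report free memory reliably, the automatic limit falls back to one model per GPU; otherwise it is defaultModelsPerGPU times the GPU count.

package main

import "fmt"

// Minimal stand-in for gpu.GpuInfo; only the field that matters here.
type gpuInfo struct {
	UnreliableFreeMemory bool
}

// autoMaxRunners mirrors the rule added to server/sched.go below: if any GPU
// cannot report free memory reliably, allow only one loaded model per GPU;
// otherwise allow defaultModelsPerGPU loaded models per GPU.
func autoMaxRunners(gpus []gpuInfo, defaultModelsPerGPU int) int {
	for _, g := range gpus {
		if g.UnreliableFreeMemory {
			return len(gpus) // e.g. AMD on Windows until ROCm v6.2
		}
	}
	return defaultModelsPerGPU * len(gpus)
}

func main() {
	reliable := []gpuInfo{{}, {}}                           // two GPUs with trustworthy reporting
	amdOnWindows := []gpuInfo{{UnreliableFreeMemory: true}}  // one GPU with unreliable reporting
	fmt.Println(autoMaxRunners(reliable, 3))     // prints 6
	fmt.Println(autoMaxRunners(amdOnWindows, 3)) // prints 1
}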

envconfig/config.go (+4 -4)

@@ -85,13 +85,13 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host, "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
-		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default 4)"},
+		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU (default auto)"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MAX_VRAM":          {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
 		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
-		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
+		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default auto)"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
@@ -129,8 +129,8 @@ func clean(key string) string {
 
 func init() {
 	// default values
-	NumParallel = 0
-	MaxRunners = 4
+	NumParallel = 0 // Autoselect
+	MaxRunners = 0  // Autoselect
 	MaxQueuedRequests = 512
 
 	LoadConfig()
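
A note on the new zero defaults above: 0 is not a literal limit but a sentinel meaning "autoselect". The sketch below is illustrative only; resolveMaxRunners is a made-up helper, and in the real code the check is the envconfig.MaxRunners <= 0 guard added to server/sched.go further down.

package main

import "fmt"

// resolveMaxRunners illustrates the zero-means-autoselect convention above:
// a positive value set via OLLAMA_MAX_LOADED_MODELS is used as-is, while the
// default of 0 defers to whatever the scheduler computes automatically.
func resolveMaxRunners(configured, automatic int) int {
	if configured > 0 {
		return configured // explicit user opt-in wins
	}
	return automatic // 0 (or negative): let the scheduler decide
}

func main() {
	fmt.Println(resolveMaxRunners(0, 6)) // prints 6: autoselected
	fmt.Println(resolveMaxRunners(2, 6)) // prints 2: user override
}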

gpu/amd_windows.go (+3 -2)

@@ -115,8 +115,6 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 			continue
 		}
 
-		// TODO revisit this once ROCm v6 is available on windows.
-		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
 		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
 		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
 		gpuInfo := RocmGPUInfo{
@@ -126,6 +124,9 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 					TotalMemory: totalMemory,
 					FreeMemory:  freeMemory,
 				},
+				// Free memory reporting on Windows is not reliable until we bump to ROCm v6.2
+				UnreliableFreeMemory: true,
+
 				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
 				DependencyPath: libDir,
 				MinimumMemory:  rocmMinimumMemory,

gpu/types.go (+5 -0)

@@ -29,6 +29,11 @@ type GpuInfo struct {
 	// Extra environment variables specific to the GPU as list of [key,value]
 	EnvWorkarounds [][2]string `json:"envs,omitempty"`
 
+	// Set to true if we can NOT reliably discover FreeMemory.  A value of true indicates
+	// the FreeMemory is best effort, and may over or under report actual memory usage
+	// False indicates FreeMemory can generally be trusted on this GPU
+	UnreliableFreeMemory bool
+
 	// GPU information
 	ID      string `json:"gpu_id"`  // string to use for selection of this specific GPU
 	Name    string `json:"name"`    // user friendly name if available

server/sched.go (+32 -4)

@@ -46,6 +46,16 @@ type Scheduler struct {
 	reschedDelay time.Duration
 }
 
+// Default automatic value for number of models we allow per GPU
+// Model will still need to fit in VRAM, but loading many small models
+// on a large GPU can cause stalling
+var defaultModelsPerGPU = 3
+
+// Default automatic value for parallel setting
+// Model will still need to fit in VRAM.  If this setting wont fit
+// we'll back off down to 1 to try to get it to fit
+var defaultParallel = 4
+
 var ErrMaxQueue = fmt.Errorf("server busy, please try again.  maximum pending requests exceeded")
 
 func InitScheduler(ctx context.Context) *Scheduler {
@@ -100,7 +110,6 @@ func (s *Scheduler) Run(ctx context.Context) {
 }
 
 func (s *Scheduler) processPending(ctx context.Context) {
-	maxRunnerFactor := 1 // number of GPUs or 1
 	for {
 		select {
 		case <-ctx.Done():
@@ -143,7 +152,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						pending.useLoadedRunner(runner, s.finishedReqCh)
 						break
 					}
-				} else if envconfig.MaxRunners > 0 && loadedCount >= (maxRunnerFactor*envconfig.MaxRunners) {
+				} else if envconfig.MaxRunners > 0 && loadedCount >= envconfig.MaxRunners {
 					slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 					runnerToExpire = s.findRunnerToUnload()
 				} else {
@@ -155,7 +164,26 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					} else {
 						gpus = s.getGpuFn()
 					}
-					maxRunnerFactor = max(len(gpus), 1)
+
+					if envconfig.MaxRunners <= 0 {
+						// No user specified MaxRunners, so figure out what automatic setting to use
+						// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
+						// if any GPU has unreliable free memory reporting, 1x the number of GPUs
+						allReliable := true
+						for _, gpu := range gpus {
+							if gpu.UnreliableFreeMemory {
+								allReliable = false
+								break
+							}
+						}
+						if allReliable {
+							envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
+							slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
+						} else {
+							slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
+							envconfig.MaxRunners = len(gpus)
+						}
+					}
 
 					// Load model for fitting
 					ggml, err := llm.LoadModel(pending.model.ModelPath)
@@ -647,7 +675,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	var numParallelToTry []int
 	if *numParallel <= 0 {
 		// If no specific parallel setting was provided, try larger then smaller, always end with 1
-		numParallelToTry = append(numParallelToTry, 4, 1)
+		numParallelToTry = append(numParallelToTry, defaultParallel, 1)
 	} else {
 		numParallelToTry = []int{*numParallel}
 	}
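
For completeness, a small illustrative sketch of the fallback order this last hunk produces; parallelCandidates is a hypothetical helper, since the real candidates are built inline in pickBestFitGPUs. With no explicit OLLAMA_NUM_PARALLEL the scheduler tries defaultParallel first and then 1, so a model that cannot fit four parallel contexts in VRAM still loads with a single context.

package main

import "fmt"

// parallelCandidates illustrates the order tried by pickBestFitGPUs: an explicit
// setting is used alone, otherwise defaultParallel is tried first, then 1.
func parallelCandidates(userSetting, defaultParallel int) []int {
	if userSetting > 0 {
		return []int{userSetting} // explicit OLLAMA_NUM_PARALLEL: no fallback
	}
	return []int{defaultParallel, 1}
}

func main() {
	fmt.Println(parallelCandidates(0, 4)) // prints [4 1]
	fmt.Println(parallelCandidates(2, 4)) // prints [2]
}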