@@ -23,6 +23,7 @@ type LlmRequest struct {
 	ctx             context.Context //nolint:containedctx
 	model           *Model
 	opts            api.Options
+	origNumCTX      int // Track the initial ctx request
 	sessionDuration time.Duration
 	successCh       chan *runnerRef
 	errCh           chan error
@@ -38,13 +39,23 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex

-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
-	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
+	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
+	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
 	getGpuFn     func() gpu.GpuInfoList
 	getCpuFn     func() gpu.GpuInfoList
 	reschedDelay time.Duration
 }

+// Default automatic value for the number of models we allow per GPU
+// Each model will still need to fit in VRAM, but loading many small models
+// on a large GPU can cause stalling
+var defaultModelsPerGPU = 3
+
+// Default automatic value for the parallel setting
+// The model will still need to fit in VRAM. If this setting won't fit,
+// we'll back off to 1 to try to get it to fit
+var defaultParallel = 4
+
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")

 func InitScheduler(ctx context.Context) *Scheduler {
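
Note (reviewer illustration, not part of the diff): with these defaults, a request's context window is multiplied by the parallel slot count before the KV cache is sized, since the server shares one cache across its parallel slots. A minimal standalone sketch of that arithmetic, assuming a 2048-token request; the helper name is hypothetical:

	package main

	import "fmt"

	// effectiveNumCtx mirrors the scheduler's "keep NumCtx and numParallel in sync" rule:
	// the total context handed to the server is the per-request context times the slot count.
	func effectiveNumCtx(origNumCtx, numParallel int) int {
		if numParallel < 1 {
			numParallel = 1
		}
		return origNumCtx * numParallel
	}

	func main() {
		fmt.Println(effectiveNumCtx(2048, 4)) // 8192 with defaultParallel = 4
		fmt.Println(effectiveNumCtx(2048, 1)) // 2048 when parallelism is disabled
	}
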
@@ -65,13 +76,10 @@ func InitScheduler(ctx context.Context) *Scheduler {

 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
-	// allocate a large enough kv cache for all parallel requests
 	if opts.NumCtx < 4 {
 		opts.NumCtx = 4
 	}

-	opts.NumCtx *= envconfig.NumParallel
-
 	req := &LlmRequest{
 		ctx:             c,
 		model:           model,
@@ -110,11 +118,25 @@ func (s *Scheduler) processPending(ctx context.Context) {
 		case pending := <-s.pendingReqCh:
 			// Block other requests until we get this pending request running
 			pending.schedAttempts++
+			if pending.origNumCTX == 0 {
+				pending.origNumCTX = pending.opts.NumCtx
+			}

 			if pending.ctx.Err() != nil {
 				slog.Debug("pending request cancelled or timed out, skipping scheduling")
 				continue
 			}
+			numParallel := envconfig.NumParallel
+			// TODO (jmorganca): multimodal models don't support parallel yet
+			// see https://github.com/ollama/ollama/issues/4165
+			if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
+				numParallel = 1
+				slog.Warn("multimodal models don't support parallel requests yet")
+			}
+			// Keep NumCtx and numParallel in sync
+			if numParallel > 1 {
+				pending.opts.NumCtx = pending.origNumCTX * numParallel
+			}

 			for {
 				var runnerToExpire *runnerRef
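
Note (reviewer illustration, not part of the diff): the hunk above pins the parallel slot count before any fitting happens. A small sketch of that selection, where a zero value means "defer to auto-tuning in pickBestFitGPUs"; the function and parameter names are hypothetical, with isMultimodal standing in for len(pending.model.ProjectorPaths) > 0 and envNumParallel for envconfig.NumParallel:

	package main

	import "fmt"

	// chooseParallel returns the parallel slot count to plan with: an explicit setting wins,
	// multimodal models are pinned to 1, and 0 is passed through for later auto-tuning.
	func chooseParallel(envNumParallel int, isMultimodal bool) int {
		numParallel := envNumParallel
		if isMultimodal && numParallel != 1 {
			numParallel = 1 // projector-based models don't support parallel requests yet
		}
		return numParallel
	}

	func main() {
		fmt.Println(chooseParallel(0, false)) // 0: defer to auto-tuning
		fmt.Println(chooseParallel(8, false)) // 8: explicit override respected
		fmt.Println(chooseParallel(4, true))  // 1: multimodal forces a single slot
	}
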
@@ -143,6 +165,26 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						gpus = s.getGpuFn()
 					}

+					if envconfig.MaxRunners <= 0 {
+						// No user specified MaxRunners, so figure out what automatic setting to use
+						// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
+						// if any GPU has unreliable free memory reporting, 1x the number of GPUs
+						allReliable := true
+						for _, gpu := range gpus {
+							if gpu.UnreliableFreeMemory {
+								allReliable = false
+								break
+							}
+						}
+						if allReliable {
+							envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
+							slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
+						} else {
+							slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
+							envconfig.MaxRunners = len(gpus)
+						}
+					}
+
 					// Load model for fitting
 					ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
 					if err != nil {
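
Note (reviewer illustration, not part of the diff): the automatic OLLAMA_MAX_LOADED_MODELS default above only trusts the larger multiplier when every GPU can report free VRAM accurately. A standalone sketch of that rule, with a plain bool slice standing in for the gpu.GpuInfo.UnreliableFreeMemory flags:

	package main

	import "fmt"

	// defaultMaxRunners mirrors the auto-detection: defaultModelsPerGPU per GPU when all
	// GPUs report free memory reliably, otherwise one model per GPU.
	func defaultMaxRunners(unreliableFreeMemory []bool, defaultModelsPerGPU int) int {
		for _, unreliable := range unreliableFreeMemory {
			if unreliable {
				return len(unreliableFreeMemory) // conservative: 1x the GPU count
			}
		}
		return defaultModelsPerGPU * len(unreliableFreeMemory)
	}

	func main() {
		fmt.Println(defaultMaxRunners([]bool{false, false}, 3)) // 6: two GPUs, all reliable
		fmt.Println(defaultMaxRunners([]bool{false, true}, 3))  // 2: one GPU is unreliable
	}
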
@@ -152,26 +194,32 @@ func (s *Scheduler) processPending(ctx context.Context) {

 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
+						// simplifying assumption of defaultParallel when in CPU mode
+						if numParallel <= 0 {
+							numParallel = defaultParallel
+							pending.opts.NumCtx = pending.origNumCTX * numParallel
+						}
+
 						if loadedCount == 0 {
 							slog.Debug("cpu mode with first model, loading")
-							s.loadFn(pending, ggml, gpus)
+							s.loadFn(pending, ggml, gpus, numParallel)
 							break
 						}
 						runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
 						if runnerToExpire == nil {
 							slog.Debug("cpu mode with available system memory or first model, loading")
-							s.loadFn(pending, ggml, gpus)
+							s.loadFn(pending, ggml, gpus, numParallel)
 							break
 						}
 						// else we need to expire a runner
 					} else if loadedCount == 0 {
 						// No models loaded. Load the model but prefer the best fit.
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
-						g := pickBestFitGPUs(pending, ggml, gpus)
+						g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
 						if g != nil {
 							gpus = g
 						}
-						s.loadFn(pending, ggml, gpus)
+						s.loadFn(pending, ggml, gpus, numParallel)
 						break
 					}

@@ -186,10 +234,10 @@ func (s *Scheduler) processPending(ctx context.Context) {

 					// Update free memory from currently loaded models
 					s.updateFreeSpace(availGpus)
-					fitGpus := pickBestFitGPUs(pending, ggml, availGpus)
+					fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
 					if fitGpus != nil {
 						slog.Debug("new model fits with existing models, loading")
-						s.loadFn(pending, ggml, fitGpus)
+						s.loadFn(pending, ggml, fitGpus, numParallel)
 						break
 					}

@@ -350,8 +398,11 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }

-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) {
-	llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+	if numParallel < 1 {
+		numParallel = 1
+	}
+	llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 	if err != nil {
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
@@ -375,6 +426,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
 		loading:         true,
 		refCount:        1,
 	}
+	runner.numParallel = numParallel
 	runner.refMu.Lock()

 	s.loadedMu.Lock()
@@ -483,8 +535,9 @@ type runnerRef struct {
 	expireTimer *time.Timer
 	expiresAt   time.Time

-	model     *Model
-	modelPath string
+	model       *Model
+	modelPath   string
+	numParallel int
 	*api.Options
 }

@@ -525,6 +578,9 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 		optsNew.NumGPU = -1
 	}

+	// Normalize the NumCtx for parallelism
+	optsExisting.NumCtx = optsExisting.NumCtx / runner.numParallel
+
 	ctx, cancel := context.WithTimeout(ctx, timeout)
 	defer cancel()
 	if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
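
Note (reviewer illustration, not part of the diff): the loaded runner stores a NumCtx that was already multiplied by its parallel slot count at load time, so dividing by runner.numParallel converts it back to per-request units before the options comparison; otherwise a matching request would typically look like a context-size change and force a reload. A worked example with hypothetical numbers:

	package main

	import "fmt"

	func main() {
		const runnerNumParallel = 4
		loadedNumCtx := 8192 // 2048 per request * 4 parallel slots, as stored on the loaded runner
		requestedNumCtx := 2048

		// Normalize the loaded value back to per-request units before comparing.
		normalized := loadedNumCtx / runnerNumParallel
		fmt.Println(normalized == requestedNumCtx) // true: a matching request does not force a reload
	}
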
@@ -611,22 +667,38 @@ func (a ByDuration) Less(i, j int) bool {

 // pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
 // If the model can not be fit fully within the available GPU(s) nil is returned
-func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.GpuInfoList {
+// If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
+// opts.NumCtx accordingly
+func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	var estimatedVRAM uint64
+
+	var numParallelToTry []int
+	if *numParallel <= 0 {
+		// If no specific parallel setting was provided, try larger first, then smaller, always ending with 1
+		numParallelToTry = append(numParallelToTry, defaultParallel, 1)
+	} else {
+		numParallelToTry = []int{*numParallel}
+	}
+
 	for _, gl := range gpus.ByLibrary() {
 		var ok bool
 		sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)

 		// TODO - potentially sort by performance capability, existing models loaded, etc.
+		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
 		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))

 		// First attempt to fit the model into a single GPU
-		if !envconfig.SchedSpread {
-			for _, g := range sgl {
-				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-					slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
-					return []gpu.GpuInfo{g}
+		for _, p := range numParallelToTry {
+			req.opts.NumCtx = req.origNumCTX * p
+			if !envconfig.SchedSpread {
+				for _, g := range sgl {
+					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+						*numParallel = p
+						return []gpu.GpuInfo{g}
+					}
 				}
 			}
 		}
@@ -636,9 +708,13 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 		// - try subsets of GPUs instead of just falling back to 1 or all in a family

 		// Now try all the GPUs
-		if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-			slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
-			return sgl
+		for _, p := range numParallelToTry {
+			req.opts.NumCtx = req.origNumCTX * p
+			if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
+				*numParallel = p
+				return sgl
+			}
 		}
 	}
 	return nil
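
Note (reviewer illustration, not part of the diff): when no explicit parallel setting is given, pickBestFitGPUs now retries the VRAM fit with a shrinking slot count, ending at 1, and records which value finally fit. A standalone sketch of that fallback loop, with a made-up fits predicate standing in for llm.PredictServerFit and invented memory costs:

	package main

	import "fmt"

	// fitParallel tries each candidate slot count in order and returns the first one whose
	// scaled context fits the available VRAM, or 0 if none fit.
	func fitParallel(origNumCtx int, candidates []int, fits func(numCtx int) bool) int {
		for _, p := range candidates {
			if fits(origNumCtx * p) {
				return p
			}
		}
		return 0
	}

	func main() {
		// Pretend the KV cache costs 1 MiB per context token and 12 GiB of VRAM are free.
		fits := func(numCtx int) bool { return numCtx <= 12*1024 }
		fmt.Println(fitParallel(2048, []int{4, 1}, fits)) // 4: 8192 MiB fits in 12288 MiB
		fmt.Println(fitParallel(4096, []int{4, 1}, fits)) // 1: 16384 MiB does not fit, falls back
	}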