10 місяців тому · e4ff73297d
--- a/server/sched.go
+++ b/server/sched.go
@@ -133,10 +133,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
 
				 				numParallel = 1
			
 
				 				slog.Warn("multimodal models don't support parallel requests yet")
			
 
				 			}
			
 
				-			// Keep NumCtx and numParallel in sync
			
 
				-			if numParallel > 1 {
			
 
				-				pending.opts.NumCtx = pending.origNumCtx * numParallel
			
 
				-			}
			
 
				 
			
 
				 			for {
			
 
				 				cpus := s.getCpuFn()
			
@@ -234,9 +230,10 @@ func (s *Scheduler) processPending(ctx context.Context) {
 
				 						// simplifying assumption of defaultParallel when in CPU mode
			
 
				 						if numParallel <= 0 {
			
 
				 							numParallel = defaultParallel
			
 
				-							pending.opts.NumCtx = pending.origNumCtx * numParallel
			
 
				 						}
			
 
				 
			
 
				+						pending.opts.NumCtx = pending.origNumCtx * numParallel
			
 
				+
			
 
				 						if loadedCount == 0 {
			
 
				 							slog.Debug("cpu mode with first model, loading")
			
 
				 							s.loadFn(pending, ggml, gpus, numParallel)