@@ -123,36 +123,35 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						pending.useLoadedRunner(runner, s.finishedReqCh)
 						break
 					}
-				} else if loadedCount == 0 {
-					slog.Debug("loading first model", "model", pending.model.ModelPath)
-					gpus := s.getGpuFn()
-
-					ggml, err := llm.LoadModel(pending.model.ModelPath)
-					if err != nil {
-						pending.errCh <- err
-						break
-					}
-					g := pickBestFitGPUs(pending, ggml, gpus)
-					if g != nil {
-						gpus = g
-					}
-					s.loadFn(pending, ggml, gpus)
-					break
 				} else if loadedMax > 0 && loadedCount >= loadedMax {
 					slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 					runnerToExpire = s.findRunnerToUnload(pending)
 				} else {
-					// More than one loaded model, so we have to see if the new one fits
+					// Either no models are loaded or below loadedMax
 					// Get a refreshed GPU list
 					gpus := s.getGpuFn()
-					// Update free memory from currently loaded models
-					s.updateFreeSpace(gpus)
 
+					// Load model for fitting
 					ggml, err := llm.LoadModel(pending.model.ModelPath)
 					if err != nil {
 						pending.errCh <- err
 						break
 					}
+
+					// No models loaded. Load the model but prefer the best fit.
+					if loadedCount == 0 {
+						slog.Debug("loading first model", "model", pending.model.ModelPath)
+						g := pickBestFitGPUs(pending, ggml, gpus)
+						if g != nil {
+							gpus = g
+						}
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+
+					// More than one loaded model, so we have to see if the new one fits
+					// Update free memory from currently loaded models
+					s.updateFreeSpace(gpus)
 					gpus = pickBestFitGPUs(pending, ggml, gpus)
 					if gpus != nil {
 						slog.Debug("new model fits with existing models, loading")