
Prevent partial loading on mixed GPU brands

In multi-brand GPU setups, if we couldn't fully load the model we
would fall through the scheduler and mistakenly try to load across
a mix of brands. This change makes sure we find the set of GPU(s)
that best fits the partial load.
Daniel Hiltgen, 9 months ago
commit 345420998e
2 changed files with 66 additions and 4 deletions
  1. server/sched.go (+27, -4)
  2. server/sched_test.go (+39, -0)
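
Before the diffs, here is a minimal, self-contained sketch of the behaviour the commit message describes. The types and helpers below (gpuInfo, byLibrary, predictFit, pickGPUs) are simplified stand-ins for this example, not the real gpu.GpuInfoList or llm.PredictServerFit APIs: prefer a full fit within a single GPU library, and when no library can hold the whole model, fall back to a partial load that is still confined to one brand.

```go
// Sketch only: stand-in types and helpers, not the real ollama scheduler API.
package main

import "fmt"

type gpuInfo struct {
	Library    string // GPU brand/runtime, e.g. "cuda" or "rocm"
	FreeMemory uint64 // bytes of free VRAM
}

// byLibrary groups GPUs by brand, mirroring what gpu.GpuInfoList.ByLibrary does.
func byLibrary(gpus []gpuInfo) [][]gpuInfo {
	groups := map[string][]gpuInfo{}
	var order []string
	for _, g := range gpus {
		if _, seen := groups[g.Library]; !seen {
			order = append(order, g.Library)
		}
		groups[g.Library] = append(groups[g.Library], g)
	}
	result := make([][]gpuInfo, 0, len(order))
	for _, lib := range order {
		result = append(result, groups[lib])
	}
	return result
}

// predictFit is a stand-in for llm.PredictServerFit: it reports whether the
// model fits entirely on the given GPUs and how much VRAM a (possibly
// partial) load would use there.
func predictFit(gpus []gpuInfo, modelSize uint64) (fullFit bool, estimatedVRAM uint64) {
	var free uint64
	for _, g := range gpus {
		free += g.FreeMemory
	}
	if free >= modelSize {
		return true, modelSize
	}
	return false, free
}

// pickGPUs shows the new ordering: try a full fit within one library first,
// and only if that fails fall back to a partial load, again restricted to the
// single library with the best estimated fit, never a mix of brands.
func pickGPUs(gpus []gpuInfo, modelSize uint64) []gpuInfo {
	groups := byLibrary(gpus)
	for _, gl := range groups { // full fit, one library at a time
		if full, _ := predictFit(gl, modelSize); full {
			return gl
		}
	}
	best, bestEstimate := 0, uint64(0) // partial-fit fallback
	for i, gl := range groups {
		if _, est := predictFit(gl, modelSize); est > bestEstimate {
			best, bestEstimate = i, est
		}
	}
	return groups[best]
}

func main() {
	gpus := []gpuInfo{
		{Library: "cuda", FreeMemory: 6 << 30},
		{Library: "rocm", FreeMemory: 4 << 30},
	}
	// A 16 GiB model: no single library fits it, so a partial load is chosen,
	// but it stays on one brand (cuda, the larger estimate) instead of both.
	picked := pickGPUs(gpus, 16<<30)
	fmt.Println("selected library:", picked[0].Library)
}
```

The estimate-based selection in pickGPUs mirrors the new pickBestPartialFitByLibrary below, which compares llm.PredictServerFit estimates per library.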

+ 27 - 4
server/sched.go

@@ -212,9 +212,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					} else if loadedCount == 0 {
 						// No models loaded. Load the model but prefer the best fit.
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
-						g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
+						g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel)
 						if g != nil {
 							gpus = g
+						} else {
+							// Only allow partial loads when this is the first model
+							gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
 						}
 						s.loadFn(pending, ggml, gpus, numParallel)
 						break
@@ -231,7 +234,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 
 						// Update free memory from currently loaded models
 						s.updateFreeSpace(availGpus)
-						fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
+						fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
 						if fitGpus != nil {
 							slog.Debug("new model fits with existing models, loading")
 							s.loadFn(pending, ggml, fitGpus, numParallel)
@@ -668,11 +671,12 @@ func (a ByDuration) Less(i, j int) bool {
 // func (a BySize) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 // func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }
 
-// pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
+// The list of GPUs returned will always be the same brand (library)
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	var estimatedVRAM uint64
 
 	var numParallelToTry []int
@@ -723,6 +727,25 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 	return nil
 }
 
+// If multiple Libraries are detected, pick the Library which loads the most layers for the model
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+	*numParallel = 1
+	byLibrary := gpus.ByLibrary()
+	if len(byLibrary) <= 1 {
+		return gpus
+	}
+	var bestEstimate uint64
+	var bestFit int
+	for i, gl := range byLibrary {
+		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+		if estimatedVRAM > bestEstimate {
+			bestEstimate = estimatedVRAM
+			bestFit = i
+		}
+	}
+	return byLibrary[bestFit]
+}
+
 // findRunnerToUnload finds a runner to unload to make room for a new model
 func (s *Scheduler) findRunnerToUnload() *runnerRef {
 	s.loadedMu.Lock()
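
Below is a condensed sketch of the control flow processPending ends up with after this change, using stand-in types and closures rather than the real scheduler, gpu.GpuInfoList, or llm.GGML API: a partial load is only allowed when nothing is loaded yet; once other models are resident, a request that cannot fully fit heads toward findRunnerToUnload instead of a mixed-brand partial load.

```go
// Sketch only: stand-in types; pickers are passed in as closures so the
// example is self-contained.
package main

import "fmt"

type gpuSet []string // stand-in for gpu.GpuInfoList; values are library names

type picker func(gpuSet) gpuSet

// schedule mirrors the new ordering: a partial load is only an option when no
// model is loaded yet; otherwise the request must fully fit, or the scheduler
// falls back to unloading an existing runner (findRunnerToUnload).
func schedule(gpus gpuSet, loadedCount int, fullFit, partialFit picker) (target gpuSet, needUnload bool) {
	if loadedCount == 0 {
		if g := fullFit(gpus); g != nil {
			return g, false
		}
		// First model only: accept a partial load, restricted to one library.
		return partialFit(gpus), false
	}
	if g := fullFit(gpus); g != nil {
		return g, false
	}
	return nil, true // no partial loads alongside existing models
}

func main() {
	gpus := gpuSet{"cuda", "rocm"}
	// Stand-ins: pretend no library can hold the whole model, and that the
	// best single-library partial fit is the first (cuda) GPU.
	noFullFit := func(gpuSet) gpuSet { return nil }
	bestPartial := func(g gpuSet) gpuSet { return g[:1] }

	target, unload := schedule(gpus, 0, noFullFit, bestPartial)
	fmt.Println("first model:", target, "needUnload:", unload) // [cuda] false

	target, unload = schedule(gpus, 1, noFullFit, bestPartial)
	fmt.Println("with models loaded:", target, "needUnload:", unload) // [] true
}
```

Note that the real pickBestPartialFitByLibrary also pins numParallel to 1 before choosing a library, which this sketch omits.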

+ 39 - 0
server/sched_test.go

@@ -666,6 +666,45 @@ func TestAlreadyCanceled(t *testing.T) {
 	require.Empty(t, scenario1a.req.successCh)
 }
 
+func TestHomogeneousGPUs(t *testing.T) {
+	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer done()
+	s := InitScheduler(ctx)
+
+	s.getGpuFn = func() gpu.GpuInfoList {
+		// Set memory values to require the model to be spread
+		gpus := []gpu.GpuInfo{
+			{Library: "cuda"},
+			{Library: "rocm"},
+		}
+		gpus[0].TotalMemory = 1 * format.GibiByte
+		gpus[0].FreeMemory = 256 * format.MebiByte
+		gpus[1].TotalMemory = 1 * format.GibiByte
+		gpus[1].FreeMemory = 256 * format.MebiByte
+		return gpus
+	}
+	s.getCpuFn = getCpuFn
+	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
+	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+		require.Len(t, gpus, 1)
+		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
+	}
+	slog.Info("a")
+	s.pendingReqCh <- a.req
+	require.Len(t, s.pendingReqCh, 1)
+	s.Run(ctx)
+	select {
+	case resp := <-a.req.successCh:
+		require.Equal(t, resp.llama, a.srv)
+		require.Empty(t, s.pendingReqCh)
+		require.Empty(t, a.req.errCh)
+	case err := <-a.req.errCh:
+		t.Fatal(err.Error())
+	case <-ctx.Done():
+		t.Fatal("timeout")
+	}
+}
+
 type mockLlm struct {
 	pingResp           error
 	waitResp           error