Bladeren bron

Wait for GPU free memory reporting to converge

The GPU drivers take a while to update their free memory reporting, so we need
to wait until the values converge with what we're expecting before proceeding
to start another runner in order to get an accurate picture.
Daniel Hiltgen 11 maanden geleden
bovenliggende
commit
354ad9254e
2 gewijzigde bestanden met toevoegingen van 61 en 3 verwijderingen
  1. 3 3
      gpu/cpu_common.go
  2. 58 0
      server/sched.go

+ 3 - 3
gpu/cpu_common.go

@@ -8,14 +8,14 @@ import (
 
 
 func GetCPUVariant() string {
 func GetCPUVariant() string {
 	if cpu.X86.HasAVX2 {
 	if cpu.X86.HasAVX2 {
-		slog.Info("CPU has AVX2")
+		slog.Debug("CPU has AVX2")
 		return "avx2"
 		return "avx2"
 	}
 	}
 	if cpu.X86.HasAVX {
 	if cpu.X86.HasAVX {
-		slog.Info("CPU has AVX")
+		slog.Debug("CPU has AVX")
 		return "avx"
 		return "avx"
 	}
 	}
-	slog.Info("CPU does not have vector extensions")
+	slog.Debug("CPU does not have vector extensions")
 	// else LCD
 	// else LCD
 	return ""
 	return ""
 }
 }

+ 58 - 0
server/sched.go

@@ -265,11 +265,14 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 
 
 			s.loadedMu.Lock()
 			s.loadedMu.Lock()
 			slog.Debug("got lock to unload", "model", runner.model)
 			slog.Debug("got lock to unload", "model", runner.model)
+			finished := runner.waitForVRAMRecovery()
 			runner.unload()
 			runner.unload()
 			delete(s.loaded, runner.model)
 			delete(s.loaded, runner.model)
 			s.loadedMu.Unlock()
 			s.loadedMu.Unlock()
 			slog.Debug("runner released", "model", runner.model)
 			slog.Debug("runner released", "model", runner.model)
 			runner.refMu.Unlock()
 			runner.refMu.Unlock()
+
+			<-finished
 			slog.Debug("sending an unloaded event", "model", runner.model)
 			slog.Debug("sending an unloaded event", "model", runner.model)
 			s.unloadedCh <- struct{}{}
 			s.unloadedCh <- struct{}{}
 		}
 		}
@@ -465,6 +468,61 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 	return false
 	return false
 }
 }
 
 
+// waitForVRAMRecovery polls the GPUs until their reported free memory has
+// recovered after this runner is unloaded, or until a timeout elapses.
+//
+// Free memory reporting on GPUs can lag for a while even after the runner
+// exits, so we have to keep checking until we see the available memory recover;
+// otherwise subsequent model loads will get far fewer layers loaded or, in the
+// worst case, may completely fall back to CPU mode.
+//
+// This routine must be called before the runner unloads so it can establish
+// a before and after GPU memory allocation. The returned channel receives
+// exactly one value when we're done waiting, or have timed out and should
+// proceed anyway. The channel is buffered so that single send never blocks,
+// even if the caller never reads from it.
+func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
+	finished := make(chan interface{}, 1)
+
+	// CPU or Metal don't need checking, so no waiting required
+	if len(runner.gpus) == 1 && (runner.gpus[0].Library == "cpu" || runner.gpus[0].Library == "metal") {
+		finished <- struct{}{}
+		return finished
+	}
+	start := time.Now()
+
+	// Establish a baseline before we unload
+	var freeMemoryBefore uint64
+	for _, g := range gpu.GetGPUInfo() {
+		freeMemoryBefore += g.FreeMemory
+	}
+	go func() {
+		expiresAt := start.Add(5 * time.Second) // typical convergence is 0.5-1.5s
+		ticker := time.NewTicker(250 * time.Millisecond)
+		defer ticker.Stop()
+		for {
+			<-ticker.C
+			if time.Now().After(expiresAt) {
+				slog.Warn("gpu VRAM usage didn't recover within timeout", "seconds", time.Since(start).Seconds())
+				finished <- struct{}{}
+				// Stop polling once we have signalled; without this the loop
+				// keeps querying the GPUs and a later send blocks forever.
+				return
+			}
+
+			// Query GPUs, look for free to go back up
+			var freeMemoryNow uint64
+			for _, g := range gpu.GetGPUInfo() {
+				freeMemoryNow += g.FreeMemory
+			}
+			// If we're within ~80% of the estimated memory usage recovered, bail out.
+			// Only compare when free memory actually grew: the subtraction is
+			// unsigned and would wrap to a huge value if free memory dropped
+			// below the baseline, falsely signalling convergence.
+			if freeMemoryNow > freeMemoryBefore &&
+				float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 {
+				slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()))
+				finished <- struct{}{}
+				return
+			}
+		}
+	}()
+	return finished
+}
+
 type ByDuration []*runnerRef
 type ByDuration []*runnerRef
 
 
 func (a ByDuration) Len() int      { return len(a) }
 func (a ByDuration) Len() int      { return len(a) }