@@ -23,6 +23,7 @@ type LlmRequest struct {
 	ctx             context.Context //nolint:containedctx
 	model           *Model
 	opts            api.Options
+	origNumCTX      int // Track the initial ctx request
 	sessionDuration time.Duration
 	successCh       chan *runnerRef
 	errCh           chan error
@@ -38,13 +39,23 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex

-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
-	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
+	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
+	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
 	getGpuFn     func() gpu.GpuInfoList
 	getCpuFn     func() gpu.GpuInfoList
 	reschedDelay time.Duration
 }

+// Default automatic value for the number of models we allow per GPU
+// Each model will still need to fit in VRAM, but loading many small models
+// on a large GPU can cause stalling
+var defaultModelsPerGPU = 3
+
+// Default automatic value for the parallel setting
+// The model will still need to fit in VRAM. If this setting won't fit,
+// we'll back off to 1 to try to get it to fit
+var defaultParallel = 4
+
 var ErrMaxQueue = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")

 func InitScheduler(ctx context.Context) *Scheduler {
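
Note (reviewer illustration, not part of the diff): with these defaults, a request's context window is multiplied by the parallel slot count before the KV cache is sized, since the server shares one cache across its parallel slots. A minimal standalone sketch of that arithmetic, assuming a 2048-token request; the helper name is hypothetical:

	package main

	import "fmt"

	// effectiveNumCtx mirrors the scheduler's "keep NumCtx and numParallel in sync" rule:
	// the total context handed to the server is the per-request context times the slot count.
	func effectiveNumCtx(origNumCtx, numParallel int) int {
		if numParallel < 1 {
			numParallel = 1
		}
		return origNumCtx * numParallel
	}

	func main() {
		fmt.Println(effectiveNumCtx(2048, 4)) // 8192 with defaultParallel = 4
		fmt.Println(effectiveNumCtx(2048, 1)) // 2048 when parallelism is disabled
	}
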
@@ -65,13 +76,10 @@ func InitScheduler(ctx context.Context) *Scheduler {

 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
-	// allocate a large enough kv cache for all parallel requests
 	if opts.NumCtx < 4 {
 		opts.NumCtx = 4
 	}

-	opts.NumCtx *= envconfig.NumParallel
-
 	req := &LlmRequest{
 		ctx:             c,
 		model:           model,
@@ -110,11 +118,25 @@ func (s *Scheduler) processPending(ctx context.Context) {
 		case pending := <-s.pendingReqCh:
 			// Block other requests until we get this pending request running
 			pending.schedAttempts++
+			if pending.origNumCTX == 0 {
+				pending.origNumCTX = pending.opts.NumCtx
+			}

 			if pending.ctx.Err() != nil {
 				slog.Debug("pending request cancelled or timed out, skipping scheduling")
 				continue
 			}
+			numParallel := envconfig.NumParallel
+			// TODO (jmorganca): multimodal models don't support parallel yet
+			// see https://github.com/ollama/ollama/issues/4165
+			if len(pending.model.ProjectorPaths) > 0 && numParallel != 1 {
+				numParallel = 1
+				slog.Warn("multimodal models don't support parallel requests yet")
+			}
+			// Keep NumCtx and numParallel in sync
+			if numParallel > 1 {
+				pending.opts.NumCtx = pending.origNumCTX * numParallel
+			}

 			for {
 				var runnerToExpire *runnerRef
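
Note (reviewer illustration, not part of the diff): the hunk above pins the parallel slot count before any fitting happens. A small sketch of that selection, where a zero value means "defer to auto-tuning in pickBestFitGPUs"; the function and parameter names are hypothetical, with isMultimodal standing in for len(pending.model.ProjectorPaths) > 0 and envNumParallel for envconfig.NumParallel:

	package main

	import "fmt"

	// chooseParallel returns the parallel slot count to plan with: an explicit setting wins,
	// multimodal models are pinned to 1, and 0 is passed through for later auto-tuning.
	func chooseParallel(envNumParallel int, isMultimodal bool) int {
		numParallel := envNumParallel
		if isMultimodal && numParallel != 1 {
			numParallel = 1 // projector-based models don't support parallel requests yet
		}
		return numParallel
	}

	func main() {
		fmt.Println(chooseParallel(0, false)) // 0: defer to auto-tuning
		fmt.Println(chooseParallel(8, false)) // 8: explicit override respected
		fmt.Println(chooseParallel(4, true))  // 1: multimodal forces a single slot
	}
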
@@ -143,6 +165,26 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						gpus = s.getGpuFn()
 					}

+					if envconfig.MaxRunners <= 0 {
+						// No user specified MaxRunners, so figure out what automatic setting to use
+						// If all GPUs have reliable free memory reporting, defaultModelsPerGPU * the number of GPUs
+						// if any GPU has unreliable free memory reporting, 1x the number of GPUs
+						allReliable := true
+						for _, gpu := range gpus {
+							if gpu.UnreliableFreeMemory {
+								allReliable = false
+								break
+							}
+						}
+						if allReliable {
+							envconfig.MaxRunners = defaultModelsPerGPU * len(gpus)
+							slog.Debug("updating default concurrency", "OLLAMA_MAX_LOADED_MODELS", envconfig.MaxRunners, "gpu_count", len(gpus))
+						} else {
+							slog.Info("one or more GPUs detected that are unable to accurately report free memory - disabling default concurrency")
+							envconfig.MaxRunners = len(gpus)
+						}
+					}
+
 					// Load model for fitting
 					ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
 					if err != nil {
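
Note (reviewer illustration, not part of the diff): the automatic OLLAMA_MAX_LOADED_MODELS default above only trusts the larger multiplier when every GPU can report free VRAM accurately. A standalone sketch of that rule, with a plain bool slice standing in for the gpu.GpuInfo.UnreliableFreeMemory flags:

	package main

	import "fmt"

	// defaultMaxRunners mirrors the auto-detection: defaultModelsPerGPU per GPU when all
	// GPUs report free memory reliably, otherwise one model per GPU.
	func defaultMaxRunners(unreliableFreeMemory []bool, defaultModelsPerGPU int) int {
		for _, unreliable := range unreliableFreeMemory {
			if unreliable {
				return len(unreliableFreeMemory) // conservative: 1x the GPU count
			}
		}
		return defaultModelsPerGPU * len(unreliableFreeMemory)
	}

	func main() {
		fmt.Println(defaultMaxRunners([]bool{false, false}, 3)) // 6: two GPUs, all reliable
		fmt.Println(defaultMaxRunners([]bool{false, true}, 3))  // 2: one GPU is unreliable
	}
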
@@ -152,26 +194,32 @@ func (s *Scheduler) processPending(ctx context.Context) {

 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
+						// simplifying assumption of defaultParallel when in CPU mode
+						if numParallel <= 0 {
+							numParallel = defaultParallel
+							pending.opts.NumCtx = pending.origNumCTX * numParallel
+						}
+
 						if loadedCount == 0 {
 							slog.Debug("cpu mode with first model, loading")
-							s.loadFn(pending, ggml, gpus)
+							s.loadFn(pending, ggml, gpus, numParallel)
 							break
 						}
 						runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
 						if runnerToExpire == nil {
 							slog.Debug("cpu mode with available system memory or first model, loading")
-							s.loadFn(pending, ggml, gpus)
+							s.loadFn(pending, ggml, gpus, numParallel)
 							break
 						}
 						// else we need to expire a runner
 					} else if loadedCount == 0 {
 						// No models loaded. Load the model but prefer the best fit.
 						slog.Debug("loading first model", "model", pending.model.ModelPath)
-						g := pickBestFitGPUs(pending, ggml, gpus)
+						g := pickBestFitGPUs(pending, ggml, gpus, &numParallel)
 						if g != nil {
 							gpus = g
 						}
-						s.loadFn(pending, ggml, gpus)
+						s.loadFn(pending, ggml, gpus, numParallel)
 						break
 					}

@@ -186,10 +234,10 @@ func (s *Scheduler) processPending(ctx context.Context) {

 					// Update free memory from currently loaded models
 					s.updateFreeSpace(availGpus)
-					fitGpus := pickBestFitGPUs(pending, ggml, availGpus)
+					fitGpus := pickBestFitGPUs(pending, ggml, availGpus, &numParallel)
 					if fitGpus != nil {
 						slog.Debug("new model fits with existing models, loading")
-						s.loadFn(pending, ggml, fitGpus)
+						s.loadFn(pending, ggml, fitGpus, numParallel)
 						break
 					}

@@ -350,8 +398,11 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }

-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) {
-	llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+	if numParallel < 1 {
+		numParallel = 1
+	}
+	llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
 	if err != nil {
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
@@ -375,6 +426,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
 		loading:         true,
 		refCount:        1,
 	}
+	runner.numParallel = numParallel
 	runner.refMu.Lock()

 	s.loadedMu.Lock()
@@ -483,8 +535,9 @@ type runnerRef struct {
 	expireTimer *time.Timer
 	expiresAt   time.Time

-	model     *Model
-	modelPath string
+	model       *Model
+	modelPath   string
+	numParallel int
 	*api.Options
 }

@@ -525,6 +578,9 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
 		optsNew.NumGPU = -1
 	}

+	// Normalize the NumCtx for parallelism
+	optsExisting.NumCtx = optsExisting.NumCtx / runner.numParallel
+
 	ctx, cancel := context.WithTimeout(ctx, timeout)
 	defer cancel()
 	if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
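
Note (reviewer illustration, not part of the diff): the loaded runner stores a NumCtx that was already multiplied by its parallel slot count at load time, so dividing by runner.numParallel converts it back to per-request units before the options comparison; otherwise a matching request would typically look like a context-size change and force a reload. A worked example with hypothetical numbers:

	package main

	import "fmt"

	func main() {
		const runnerNumParallel = 4
		loadedNumCtx := 8192 // 2048 per request * 4 parallel slots, as stored on the loaded runner
		requestedNumCtx := 2048

		// Normalize the loaded value back to per-request units before comparing.
		normalized := loadedNumCtx / runnerNumParallel
		fmt.Println(normalized == requestedNumCtx) // true: a matching request does not force a reload
	}
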
@@ -611,22 +667,38 @@ func (a ByDuration) Less(i, j int) bool {

 // pickBestFitGPUs will try to find the optimal placement of the model in the available GPUs where the model fully fits
 // If the model can not be fit fully within the available GPU(s) nil is returned
-func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.GpuInfoList {
+// If numParallel is <= 0, this will attempt to optimize parallelism based on available VRAM, and adjust
+// opts.NumCtx accordingly
+func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
 	var estimatedVRAM uint64
+
+	var numParallelToTry []int
+	if *numParallel <= 0 {
+		// If no specific parallel setting was provided, try larger first, then smaller, always ending with 1
+		numParallelToTry = append(numParallelToTry, defaultParallel, 1)
+	} else {
+		numParallelToTry = []int{*numParallel}
+	}
+
 	for _, gl := range gpus.ByLibrary() {
 		var ok bool
 		sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)

 		// TODO - potentially sort by performance capability, existing models loaded, etc.
+		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
 		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))

 		// First attempt to fit the model into a single GPU
-		if !envconfig.SchedSpread {
-			for _, g := range sgl {
-				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-					slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
-					return []gpu.GpuInfo{g}
+		for _, p := range numParallelToTry {
+			req.opts.NumCtx = req.origNumCTX * p
+			if !envconfig.SchedSpread {
+				for _, g := range sgl {
+					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
+						*numParallel = p
+						return []gpu.GpuInfo{g}
+					}
 				}
 			}
 		}
@@ -636,9 +708,13 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 		// - try subsets of GPUs instead of just falling back to 1 or all in a family

 		// Now try all the GPUs
-		if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
-			slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
-			return sgl
+		for _, p := range numParallelToTry {
+			req.opts.NumCtx = req.origNumCTX * p
+			if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
+				*numParallel = p
+				return sgl
+			}
 		}
 	}
 	return nil
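
Note (reviewer illustration, not part of the diff): when no explicit parallel setting is given, pickBestFitGPUs now retries the VRAM fit with a shrinking slot count, ending at 1, and records which value finally fit. A standalone sketch of that fallback loop, with a made-up fits predicate standing in for llm.PredictServerFit and invented memory costs:

	package main

	import "fmt"

	// fitParallel tries each candidate slot count in order and returns the first one whose
	// scaled context fits the available VRAM, or 0 if none fit.
	func fitParallel(origNumCtx int, candidates []int, fits func(numCtx int) bool) int {
		for _, p := range candidates {
			if fits(origNumCtx * p) {
				return p
			}
		}
		return 0
	}

	func main() {
		// Pretend the KV cache costs 1 MiB per context token and 12 GiB of VRAM are free.
		fits := func(numCtx int) bool { return numCtx <= 12*1024 }
		fmt.Println(fitParallel(2048, []int{4, 1}, fits)) // 4: 8192 MiB fits in 12288 MiB
		fmt.Println(fitParallel(4096, []int{4, 1}, fits)) // 1: 16384 MiB does not fit, falls back
	}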