@@ -15,9 +15,9 @@ import (
 	"time"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -41,10 +41,10 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex
 
-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-	getGpuFn     func() gpu.GpuInfoList
-	getCpuFn     func() gpu.GpuInfoList
+	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	getGpuFn     func() discover.GpuInfoList
+	getCpuFn     func() discover.GpuInfoList
 	reschedDelay time.Duration
 }
 
@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		unloadedCh:    make(chan interface{}, maxQueue),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
-		getGpuFn:      gpu.GetGPUInfo,
-		getCpuFn:      gpu.GetCPUInfo,
+		getGpuFn:      discover.GetGPUInfo,
+		getCpuFn:      discover.GetCPUInfo,
 		reschedDelay:  250 * time.Millisecond,
 	}
 	sched.loadFn = sched.load
@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			} else {
 				// Either no models are loaded or below envconfig.MaxRunners
 				// Get a refreshed GPU list
-				var gpus gpu.GpuInfoList
+				var gpus discover.GpuInfoList
 				if pending.opts.NumGPU == 0 {
 					gpus = s.getCpuFn()
 				} else {
@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	}()
 }
 
-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 	type predKey struct {
 		Library string
 		ID      string
@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-	ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+	ret := append(discover.GpuInfoList{}, allGpus...)
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
 	for _, runner := range s.loaded {
@@ -541,8 +541,8 @@ type runnerRef struct {
 	// unloading bool // set to true when we are trying to unload the runner
 
 	llama          llm.LlamaServer
-	loading        bool            // True only during initial load, then false forever
-	gpus           gpu.GpuInfoList // Recorded at time of provisioning
+	loading        bool                 // True only during initial load, then false forever
+	gpus           discover.GpuInfoList // Recorded at time of provisioning
 	estimatedVRAM  uint64
 	estimatedTotal uint64
 
@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	start := time.Now()
 
 	// Establish a baseline before we unload
-	gpusBefore := gpu.GetGPUInfo()
+	gpusBefore := discover.GetGPUInfo()
 	var totalMemoryBefore, freeMemoryBefore uint64
 	for _, gpu := range gpusBefore {
 		totalMemoryBefore += gpu.TotalMemory
@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 			}
 
 			// Query GPUs, look for free to go back up
-			gpusNow := gpu.GetGPUInfo()
+			gpusNow := discover.GetGPUInfo()
 			var totalMemoryNow, freeMemoryNow uint64
 			for _, gpu := range gpusNow {
 				totalMemoryNow += gpu.TotalMemory
@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64
 
 	var numParallelToTry []int
@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 	for _, gl := range gpus.ByLibrary() {
 		var ok bool
-		sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 
 		// TODO - potentially sort by performance capability, existing models loaded, etc.
 		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
 
 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
-						return []gpu.GpuInfo{g}
+						return []discover.GpuInfo{g}
 					}
 				}
 			}
 		}
@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
 	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {