@@ -17,8 +17,9 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/fileutils"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/llm"
+	"github.com/ollama/ollama/runners"
 )
 
 type LlmRequest struct {
@@ -41,8 +42,8 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex
 
-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
-	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	loadFn       func(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *fileutils.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (runners.LLMServer, error)
 	getGpuFn     func() discover.GpuInfoList
 	getCpuFn     func() discover.GpuInfoList
 	reschedDelay time.Duration
@@ -68,7 +69,7 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		expiredCh:     make(chan *runnerRef, maxQueue),
 		unloadedCh:    make(chan interface{}, maxQueue),
 		loaded:        make(map[string]*runnerRef),
-		newServerFn:   llm.NewLlamaServer,
+		newServerFn:   runners.NewLlamaServer,
 		getGpuFn:      discover.GetGPUInfo,
 		getCpuFn:      discover.GetCPUInfo,
 		reschedDelay:  250 * time.Millisecond,
@@ -187,7 +188,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				}
 
 				// Load model for fitting
-				ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
+				ggml, err := fileutils.LoadModel(pending.model.ModelPath, 0)
 				if err != nil {
 					pending.errCh <- err
 					break
@@ -409,7 +410,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
@@ -422,7 +423,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoL
 		// some older models are not compatible with newer versions of llama.cpp
 		// show a generalized compatibility error until there is a better way to
 		// check for model compatibility
-		if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
+		if errors.Is(err, fileutils.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
 			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
 		}
 		slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
@@ -540,7 +541,7 @@ type runnerRef struct {
 	refCount uint // prevent unloading if > 0
 	// unloading bool      // set to true when we are trying to unload the runner
 
-	llama          llm.LlamaServer
+	llama          runners.LLMServer
 	loading        bool                 // True only during initial load, then false forever
 	gpus           discover.GpuInfoList // Recorded at time of provisioning
 	estimatedVRAM  uint64
@@ -685,7 +686,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64
 
 	var numParallelToTry []int
@@ -710,7 +711,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = fileutils.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
 						return []discover.GpuInfo{g}
@@ -726,7 +727,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 		// Now try all the GPUs
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
-			if ok, estimatedVRAM = llm.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+			if ok, estimatedVRAM = fileutils.PredictServerFit(sgl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 				slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
 				*numParallel = p
 				return sgl
@@ -737,7 +738,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.Gpu
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
@@ -749,7 +750,7 @@ func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.
 	var bestEstimate uint64
 	var bestFit int
 	for i, gl := range byLibrary {
-		_, estimatedVRAM := llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
+		_, estimatedVRAM := fileutils.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
 		if estimatedVRAM > bestEstimate {
 			bestEstimate = estimatedVRAM
 			bestFit = i
@@ -822,9 +823,9 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *fileutils.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
-	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
+	estimate := fileutils.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {
 		slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
 		return nil