sched.go

package server

import (
	"context"
	"errors"
	"fmt"
	"log/slog"
	"os"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"golang.org/x/exp/slices"
)
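
// LlmRequest is a single queued request for a model runner. The scheduler
// answers it on successCh with a ready runner or on errCh with an error.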
type LlmRequest struct {
	ctx             context.Context //nolint:containedctx
	model           *Model
	opts            api.Options
	sessionDuration time.Duration
	successCh       chan *runnerRef
	errCh           chan error
}
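
// Scheduler serializes model load and unload decisions. Loaded runners are
// tracked by model path, and work flows through the pending, finished,
// expired, and unloaded channels. The function-valued fields allow the
// loading, server creation, and GPU discovery paths to be swapped out, e.g.
// in tests.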
type Scheduler struct {
	pendingReqCh  chan *LlmRequest
	finishedReqCh chan *LlmRequest
	expiredCh     chan *runnerRef
	unloadedCh    chan interface{}

	loaded   map[string]*runnerRef
	loadedMu sync.Mutex

	loadFn      func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList)
	newServerFn func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error)
	getGpuFn    func() gpu.GpuInfoList
}

var (
	// TODO set this to zero after a release or two, to enable multiple models by default
	loadedMax         = 1 // Maximum runners; < 1 maps to as many as will fit in VRAM (unlimited for CPU runners)
	maxQueuedRequests = 512
	numParallel       = 1
	ErrMaxQueue       = fmt.Errorf("server busy, please try again. maximum pending requests exceeded")
)
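
// InitScheduler builds a Scheduler with channels sized by the queue limit and
// reads optional overrides from the environment, for example (illustrative
// values, not defaults):
//
//	OLLAMA_MAX_LOADED_MODELS=2 OLLAMA_NUM_PARALLEL=4 OLLAMA_MAX_QUEUE=100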
func InitScheduler(ctx context.Context) *Scheduler {
	maxRunners := os.Getenv("OLLAMA_MAX_LOADED_MODELS")
	if maxRunners != "" {
		m, err := strconv.Atoi(maxRunners)
		if err != nil {
			slog.Error("invalid setting", "OLLAMA_MAX_LOADED_MODELS", maxRunners, "error", err)
		} else {
			loadedMax = m
		}
	}
	if onp := os.Getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
		p, err := strconv.Atoi(onp)
		if err != nil || p <= 0 {
			slog.Error("invalid parallel setting, must be greater than zero", "OLLAMA_NUM_PARALLEL", onp, "error", err)
		} else {
			numParallel = p
		}
	}
	if onp := os.Getenv("OLLAMA_MAX_QUEUE"); onp != "" {
		p, err := strconv.Atoi(onp)
		if err != nil || p <= 0 {
			slog.Error("invalid setting", "OLLAMA_MAX_QUEUE", onp, "error", err)
		} else {
			maxQueuedRequests = p
		}
	}

	sched := &Scheduler{
		pendingReqCh:  make(chan *LlmRequest, maxQueuedRequests),
		finishedReqCh: make(chan *LlmRequest, maxQueuedRequests),
		expiredCh:     make(chan *runnerRef, maxQueuedRequests),
		unloadedCh:    make(chan interface{}, maxQueuedRequests),
		loaded:        make(map[string]*runnerRef),
		newServerFn:   llm.NewLlamaServer,
		getGpuFn:      gpu.GetGPUInfo,
	}
	sched.loadFn = sched.load
	return sched
}
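
// A minimal usage sketch (assuming the caller has already resolved a *Model
// and api.Options; reqCtx, cancelReq, model, and opts are placeholders, not
// names defined in this file):
//
//	sched := InitScheduler(ctx)
//	sched.Run(ctx)
//	successCh, errCh := sched.GetRunner(reqCtx, model, opts, 5*time.Minute)
//	select {
//	case runner := <-successCh:
//		slog.Info("runner ready", "model", runner.model)
//		cancelReq() // canceling reqCtx releases the runner when done
//	case err := <-errCh:
//		slog.Error("could not get runner", "error", err)
//	}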
// The request context must be canceled to decrement the ref count and release the runner.
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
	// context is split across parallel threads, so scale NumCtx before it is
	// copied into the request below
	opts.NumCtx = opts.NumCtx * numParallel
	req := &LlmRequest{
		ctx:             c,
		model:           model,
		opts:            opts,
		sessionDuration: sessionDuration,
		successCh:       make(chan *runnerRef),
		errCh:           make(chan error, 1),
	}
	select {
	case s.pendingReqCh <- req:
	default:
		req.errCh <- ErrMaxQueue
	}
	return req.successCh, req.errCh
}

// Run returns immediately and spawns the scheduler goroutines, which shut down when ctx is done.
func (s *Scheduler) Run(ctx context.Context) {
	slog.Debug("starting llm scheduler")
	go func() {
		s.processPending(ctx)
	}()

	go func() {
		s.processCompleted(ctx)
	}()
}
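
// processPending drains the pending request queue. Each request is either
// served by an already loaded runner, loaded onto the best-fitting GPUs, or
// forced to wait while an existing runner is expired to make room.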
func (s *Scheduler) processPending(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			slog.Debug("shutting down scheduler pending loop")
			return
		case pending := <-s.pendingReqCh:
			// Block other requests until we get this pending request running
			for {
				var runnerToExpire *runnerRef
				s.loadedMu.Lock()
				runner := s.loaded[pending.model.ModelPath]
				loadedCount := len(s.loaded)
				s.loadedMu.Unlock()
				if runner != nil {
					if runner.needsReload(ctx, pending) {
						runnerToExpire = runner
					} else {
						// Runner is usable, return it
						pending.useLoadedRunner(runner, s.finishedReqCh)
						break
					}
				} else if loadedMax > 0 && loadedCount >= loadedMax {
					slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
					runnerToExpire = s.findRunnerToUnload(pending)
				} else {
					// Either no models are loaded or below loadedMax
					// Get a refreshed GPU list
					gpus := s.getGpuFn()

					// Load model for fitting
					ggml, err := llm.LoadModel(pending.model.ModelPath)
					if err != nil {
						pending.errCh <- err
						break
					}

					// If we're in CPU-only mode, just limit by loadedMax above
					// TODO handle system memory exhaustion
					if (len(gpus) == 1 && gpus[0].Library == "cpu") || pending.opts.NumGPU == 0 {
						slog.Debug("cpu mode with existing models, loading")
						s.loadFn(pending, ggml, gpus)
						break
					}

					// No models loaded. Load the model but prefer the best fit.
					if loadedCount == 0 {
						slog.Debug("loading first model", "model", pending.model.ModelPath)
						g := pickBestFitGPUs(pending, ggml, gpus)
						if g != nil {
							gpus = g
						}
						s.loadFn(pending, ggml, gpus)
						break
					}

					// At least one model is already loaded, so we have to see if the new one fits
					// Update free memory from currently loaded models
					s.updateFreeSpace(gpus)
					gpus = pickBestFitGPUs(pending, ggml, gpus)
					if gpus != nil {
						slog.Debug("new model fits with existing models, loading")
						s.loadFn(pending, ggml, gpus)
						break
					}
					runnerToExpire = s.findRunnerToUnload(pending)
				}

				if runnerToExpire == nil {
					// Shouldn't happen
					slog.Error("runner to expire was nil!")
					continue
				}
				// Trigger an expiration to unload once it's done
				runnerToExpire.refMu.Lock()
				slog.Debug("resetting model to expire immediately to make room", "model", runnerToExpire.model, "refCount", runnerToExpire.refCount)
				if runnerToExpire.expireTimer != nil {
					runnerToExpire.expireTimer.Stop()
					runnerToExpire.expireTimer = nil
				}
				runnerToExpire.sessionDuration = 0
				if runnerToExpire.refCount <= 0 {
					s.expiredCh <- runnerToExpire
				}
				runnerToExpire.refMu.Unlock()
				// Wait for the unload to happen
				// Note: at this point we're queueing up all incoming requests, even if they were for
				// a different model that's loaded and not scheduled to be removed.
				slog.Debug("waiting for pending requests to complete and unload to occur", "model", runnerToExpire.model)
				select {
				case <-ctx.Done():
					slog.Debug("shutting down scheduler pending loop")
					return
				case <-s.unloadedCh:
					slog.Debug("unload completed", "model", runnerToExpire.model)
					continue
				}
			}
		case <-s.unloadedCh:
			// An unload request when there are no pending requests can be ignored
			slog.Debug("ignoring unload event with no pending requests")
		}
	}
}
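
// processCompleted decrements a runner's ref count when its request context
// ends, arms or resets the expiration timer once the runner goes idle, and
// performs the actual unload when an expiration event arrives.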
func (s *Scheduler) processCompleted(ctx context.Context) {
	// Process completed requests, expired timers, and unloading models
	for {
		select {
		case <-ctx.Done():
			slog.Debug("shutting down scheduler completed loop")
			return
		case finished := <-s.finishedReqCh:
			s.loadedMu.Lock()
			runner := s.loaded[finished.model.ModelPath]
			s.loadedMu.Unlock()
			if runner == nil {
				slog.Error("finished request signal received after model unloaded", "model", finished.model.ModelPath)
				continue
			}
			runner.refMu.Lock()
			runner.refCount--
			if runner.refCount <= 0 {
				if runner.sessionDuration <= 0 {
					slog.Debug("runner with zero duration has gone idle, expiring to unload", "model", runner.model)
					if runner.expireTimer != nil {
						runner.expireTimer.Stop()
						runner.expireTimer = nil
					}
					s.expiredCh <- runner
				} else if runner.expireTimer == nil {
					slog.Debug("runner with non-zero duration has gone idle, adding timer", "model", runner.model, "duration", runner.sessionDuration)
					runner.expireTimer = time.AfterFunc(runner.sessionDuration, func() {
						slog.Debug("timer expired, expiring to unload", "model", runner.model)
						runner.refMu.Lock()
						defer runner.refMu.Unlock()
						if runner.expireTimer != nil {
							runner.expireTimer.Stop()
							runner.expireTimer = nil
						}
						s.expiredCh <- runner
					})
				} else {
					slog.Debug("runner with non-zero duration has gone idle, resetting timer", "model", runner.model, "duration", runner.sessionDuration)
					runner.expireTimer.Reset(runner.sessionDuration)
				}
			}
			slog.Debug("after processing request finished event", "model", runner.model, "refCount", runner.refCount)
			runner.refMu.Unlock()
		case runner := <-s.expiredCh:
			slog.Debug("runner expired event received", "model", runner.model)
			runner.refMu.Lock()
			if runner.refCount > 0 {
				// Shouldn't happen, but safeguard to ensure no leaked runners
				slog.Debug("expired event with positive ref count, retrying", "model", runner.model, "refCount", runner.refCount)
				go func(runner *runnerRef) {
					// We can't unload yet, but want to as soon as the current request completes
					// So queue up another expired event
					time.Sleep(10 * time.Millisecond)
					s.expiredCh <- runner
				}(runner)
				runner.refMu.Unlock()
				continue
			}

			slog.Debug("got lock to unload", "model", runner.model)
			runner.unload()
			s.loadedMu.Lock()
			delete(s.loaded, runner.model)
			s.loadedMu.Unlock()
			slog.Debug("runner released", "model", runner.model)
			runner.refMu.Unlock()
			slog.Debug("sending an unloaded event", "model", runner.model)
			s.unloadedCh <- struct{}{}
		}
	}
}

// Complete the pending request and send the runner back to the requester.
// Wires up a finished event after the request context is completed.
// Updates session duration and resets the expiration timer.
func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *LlmRequest) {
	runner.refMu.Lock()
	defer runner.refMu.Unlock()
	runner.refCount++
	if runner.expireTimer != nil {
		runner.expireTimer.Stop()
		runner.expireTimer = nil
	}
	runner.sessionDuration = pending.sessionDuration
	pending.successCh <- runner
	go func() {
		<-pending.ctx.Done()
		slog.Debug("context for request finished")
		finished <- pending
	}()
}
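
// load starts a llama server for the request on the given GPUs, registers the
// runner in the loaded map, and answers the request on successCh once the
// server is running (or on errCh if startup fails).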
func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) {
	llama, err := s.newServerFn(gpus, req.model.ModelPath, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts)
	if err != nil {
		// some older models are not compatible with newer versions of llama.cpp
		// show a generalized compatibility error until there is a better way to
		// check for model compatibility
		if errors.Is(err, llm.ErrUnsupportedFormat) || strings.Contains(err.Error(), "failed to load model") {
			err = fmt.Errorf("%v: this model may be incompatible with your version of Ollama. If you previously pulled this model, try updating it by running `ollama pull %s`", err, req.model.ShortName)
		}
		slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
		req.errCh <- err
		return
	}
	runner := &runnerRef{}
	runner.model = req.model.ModelPath
	runner.adapters = req.model.AdapterPaths
	runner.projectors = req.model.ProjectorPaths
	runner.llama = llama
	runner.Options = &req.opts
	runner.sessionDuration = req.sessionDuration
	runner.gpus = gpus
	runner.estimatedVRAM = llama.EstimatedVRAM()
	runner.loading = true
	runner.refCount = 1
	runner.refMu.Lock()

	s.loadedMu.Lock()
	s.loaded[req.model.ModelPath] = runner
	slog.Info("loaded runners", "count", len(s.loaded))
	s.loadedMu.Unlock()

	go func() {
		defer runner.refMu.Unlock()
		if err = llama.WaitUntilRunning(req.ctx); err != nil {
			slog.Error("error loading llama server", "error", err)
			runner.refCount--
			req.errCh <- err
			slog.Debug("triggering expiration for failed load", "model", runner.model)
			s.expiredCh <- runner
			return
		}
		slog.Debug("finished setting up runner", "model", req.model.ModelPath)
		runner.loading = false
		go func() {
			<-req.ctx.Done()
			slog.Debug("context for request finished")
			s.finishedReqCh <- req
		}()
		req.successCh <- runner
	}()
}
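
// updateFreeSpace subtracts the estimated VRAM use of every loaded runner from
// the free memory reported for each GPU in allGpus, so the next placement
// decision is based on what is actually still available.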
func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
	type predKey struct {
		Library string
		ID      string
	}
	predMap := map[predKey]uint64{} // Sum up the total predicted usage per GPU for all runners
	s.loadedMu.Lock()
	for _, r := range s.loaded {
		r.refMu.Lock()
		gpuIDs := make([]string, 0, len(r.gpus))
		if r.llama != nil {
			// TODO this should be broken down by GPU instead of assuming uniform spread
			estimatedVRAMPerGPU := r.llama.EstimatedVRAM() / uint64(len(r.gpus))
			for _, gpu := range r.gpus {
				gpuIDs = append(gpuIDs, gpu.ID)
			}
			for _, gpu := range allGpus {
				if slices.Contains(gpuIDs, gpu.ID) {
					predMap[predKey{gpu.Library, gpu.ID}] += estimatedVRAMPerGPU
				}
			}
		} else {
			slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
		}
		r.refMu.Unlock()
	}
	s.loadedMu.Unlock()

	// Now that we've summed up all the GPU usage predictions across all the loaded runners, update the gpu list
	for i := range allGpus {
		if p, ok := predMap[predKey{allGpus[i].Library, allGpus[i].ID}]; ok {
			slog.Debug("gpu reported", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "available", format.HumanBytes2(allGpus[i].FreeMemory))
			if p > allGpus[i].TotalMemory {
				// Shouldn't happen
				slog.Warn("predicted usage exceeds VRAM", "gpu", allGpus[i].ID, "totalMemory", allGpus[i].TotalMemory, "predicted", p)
				allGpus[i].FreeMemory = 0
			} else if (allGpus[i].TotalMemory - p) < allGpus[i].FreeMemory { // predicted free is smaller than reported free, use it
				// TODO maybe we should just always trust our numbers, since cuda's free memory reporting is laggy
				// and we might unload models we didn't actually need to. The risk is that if some other GPU intensive app is loaded
				// after we start our first runner, we'll never account for that, so picking the smallest free value seems prudent.
				allGpus[i].FreeMemory = allGpus[i].TotalMemory - p
			}
			slog.Info("updated VRAM", "gpu", allGpus[i].ID, "library", allGpus[i].Library, "total", format.HumanBytes2(allGpus[i].TotalMemory), "available", format.HumanBytes2(allGpus[i].FreeMemory))
		}
	}
}
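
// runnerRef tracks a single loaded model server along with the bookkeeping the
// scheduler needs to decide when it can be reused, reloaded, or unloaded.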
type runnerRef struct {
	refMu sync.Mutex
	// refCond   sync.Cond // Signaled on transition from 1 -> 0 refCount
	refCount uint // prevent unloading if > 0
	// unloading bool      // set to true when we are trying to unload the runner

	llama         llm.LlamaServer
	loading       bool            // True only during initial load, then false forever
	gpus          gpu.GpuInfoList // Recorded at time of provisioning
	estimatedVRAM uint64

	sessionDuration time.Duration
	expireTimer     *time.Timer

	model      string
	adapters   []string
	projectors []string
	*api.Options
}

// The refMu must already be held when calling unload
func (runner *runnerRef) unload() {
	if runner.expireTimer != nil {
		runner.expireTimer.Stop()
		runner.expireTimer = nil
	}
	if runner.llama != nil {
		runner.llama.Close()
	}
	runner.llama = nil
	runner.adapters = nil
	runner.projectors = nil
	runner.Options = nil
	runner.gpus = nil
}
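
// needsReload reports whether the loaded runner must be reloaded to serve req,
// i.e. its adapters, projectors, or runner options differ, or the underlying
// server no longer answers a ping.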
func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool {
	slog.Debug("evaluating already loaded", "model", req.model.ModelPath)
	runner.refMu.Lock()
	defer runner.refMu.Unlock()

	timeout := 10 * time.Second
	if runner.loading {
		timeout = 2 * time.Minute // Initial load can take a long time for big models on slow systems...
	}

	// Don't reload runner if num_gpu=-1 was provided
	optsExisting := runner.Options.Runner
	optsNew := req.opts.Runner
	if optsNew.NumGPU < 0 {
		optsExisting.NumGPU = -1
		optsNew.NumGPU = -1
	}

	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()
	if !reflect.DeepEqual(runner.adapters, req.model.AdapterPaths) || // have the adapters changed?
		!reflect.DeepEqual(runner.projectors, req.model.ProjectorPaths) || // have the projectors changed?
		!reflect.DeepEqual(optsExisting, optsNew) || // have the runner options changed?
		runner.llama.Ping(ctx) != nil {
		return true
	}
	return false
}
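
// ByDuration orders runners by session duration so the one expiring soonest
// sorts first; negative durations (never unload) sort last.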
type ByDuration []*runnerRef

func (a ByDuration) Len() int      { return len(a) }
func (a ByDuration) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a ByDuration) Less(i, j int) bool {
	// casting to uint64 turns negative durations (never unload) into the largest values
	return uint64(a[i].sessionDuration) < uint64(a[j].sessionDuration)
}

// TODO - future consideration to pick runners based on size
// type BySize []*runnerRef
// func (a BySize) Len() int           { return len(a) }
// func (a BySize) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM }

// pickBestFitGPUs tries to find the optimal placement of the model in the available GPUs where the model fully fits.
// If the model cannot fit fully within the available GPU(s), nil is returned.
func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.GpuInfoList {
	var estimatedVRAM uint64
	for _, gl := range gpus.ByLibrary() {
		var ok bool
		sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)

		// TODO - potentially sort by performance capability, existing models loaded, etc.
		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))

		// First attempt to fit the model into a single GPU
		for _, g := range sgl {
			if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
				slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
				return []gpu.GpuInfo{g}
			}
		}

		// TODO future refinements
		// - if multiple Libraries, see if any single GPU in any Library will fit
		// - try subsets of GPUs instead of just falling back to 1 or all in a family

		// Now try all the GPUs
		if ok, estimatedVRAM = llm.PredictServerFit(gl, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
			slog.Debug("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", gl[0].Library, "required", format.HumanBytes2(estimatedVRAM))
			return gl
		}
	}
	return nil
}

// findRunnerToUnload finds a runner to unload to make room for a new model
func (s *Scheduler) findRunnerToUnload(req *LlmRequest) *runnerRef {
	s.loadedMu.Lock()
	runnerList := make([]*runnerRef, 0, len(s.loaded))
	for _, r := range s.loaded {
		runnerList = append(runnerList, r)
	}
	s.loadedMu.Unlock()

	// In the future we can enhance the algorithm to be smarter about picking the optimal runner to unload
	// e.g., if we have multiple options, will one make room for the request?
	sort.Sort(ByDuration(runnerList))

	// First try to find a runner that's already idle
	for _, runner := range runnerList {
		runner.refMu.Lock()
		rc := runner.refCount
		runner.refMu.Unlock()
		if rc == 0 {
			slog.Debug("found an idle runner to unload")
			return runner
		}
	}
	// None appear idle, just wait for the one with the shortest duration
	slog.Debug("no idle runners, picking the shortest duration", "count", len(runnerList))
	return runnerList[0]
}
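
// unloadAllRunners closes the llama server of every loaded runner.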
func (s *Scheduler) unloadAllRunners() {
	s.loadedMu.Lock()
	defer s.loadedMu.Unlock()
	for model, runner := range s.loaded {
		if runner.llama != nil {
			slog.Debug("shutting down runner", "model", model)
			runner.llama.Close()
		}
	}
}