sched_test.go

package server

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

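// init enables debug logging for every test in this package.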
func init() {
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	initialMax := loadedMax
	initialParallel := numParallel
	s := InitScheduler(ctx)
	require.Equal(t, initialMax, loadedMax)
	s.loadedMu.Lock()
	require.NotNil(t, s.loaded)
	s.loadedMu.Unlock()

	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
	s = InitScheduler(ctx)
	require.Equal(t, initialMax, loadedMax)
	s.loadedMu.Lock()
	require.NotNil(t, s.loaded)
	s.loadedMu.Unlock()

	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
	s = InitScheduler(ctx)
	require.Equal(t, 0, loadedMax)
	s.loadedMu.Lock()
	require.NotNil(t, s.loaded)
	s.loadedMu.Unlock()

	os.Setenv("OLLAMA_NUM_PARALLEL", "blue")
	_ = InitScheduler(ctx)
	require.Equal(t, initialParallel, numParallel)

	os.Setenv("OLLAMA_NUM_PARALLEL", "10")
	_ = InitScheduler(ctx)
	require.Equal(t, 10, numParallel)
}

func TestLoad(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	var ggml *llm.GGML // value not used in tests
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: 2,
	}
	// Fail to load model first
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
		return nil, fmt.Errorf("something failed to load model blah")
	}
	gpus := gpu.GpuInfoList{}
	s.load(req, ggml, gpus)
	require.Len(t, req.successCh, 0)
	require.Len(t, req.errCh, 1)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 0)
	s.loadedMu.Unlock()
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

	server := &mockLlm{estimatedVRAM: 10}
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, ggml, gpus)
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = fmt.Errorf("wait failure")
	s.load(req, ggml, gpus)
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
		t.Errorf("unexpected success %v", resp)
	}
	s.loadedMu.Lock()
	runner := s.loaded["dummy_model_path"]
	s.loadedMu.Unlock()
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
	time.Sleep(1 * time.Millisecond)
	require.Len(t, s.expiredCh, 1)
}

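// bundle groups a cancellable context, a mock llama server, and a request
// built around a temporary GGUF model file for one test scenario.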
type bundle struct {
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
	ggml    *llm.GGML
}

func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
	return scenario.srv, nil
}

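// newScenario writes a minimal single-tensor GGUF model to a temp file and
// returns a bundle whose request and mockLlm report the given VRAM estimate.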
func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64) *bundle {
	scenario := &bundle{}
	scenario.ctx, scenario.ctxDone = context.WithCancel(ctx)
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
	assert.Nil(t, err)
	defer f.Close()

	gguf := llm.NewGGUFV3(binary.LittleEndian)
	err = gguf.Encode(f, llm.KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []llm.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
	})
	assert.Nil(t, err)

	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
	scenario.ggml, err = llm.LoadModel(model.ModelPath)
	require.NoError(t, err)

	scenario.req = &LlmRequest{
		ctx:             scenario.ctx,
		model:           model,
		opts:            api.DefaultOptions(),
		sessionDuration: 5 * time.Millisecond,
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
	return scenario
}

func TestRequests(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
	scenario1a.req.sessionDuration = 0
	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
	scenario1b.req.model = scenario1a.req.model
	scenario1b.ggml = scenario1a.ggml
	scenario1b.req.sessionDuration = 0

	// simple reload of same model
	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
	scenario2a.req.model = scenario1a.req.model
	scenario2a.ggml = scenario1a.ggml

	// Multiple loaded models
	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
	scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
	scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
	scenario3c.req.opts.NumGPU = 0 // CPU load, will be allowed
	scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs a prior model unloaded to fit

	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-scenario1a.req.successCh:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario1a.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	// Same runner as first request due to not needing a reload
	s.newServerFn = scenario1b.newServer
	slog.Info("scenario1b")
	s.pendingReqCh <- scenario1b.req
	select {
	case resp := <-scenario1b.req.successCh:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario1b.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	// Trigger a reload
	s.newServerFn = scenario2a.newServer
	scenario2a.req.model.AdapterPaths = []string{"new"}
	slog.Info("scenario2a")
	s.pendingReqCh <- scenario2a.req
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
	scenario1a.ctxDone()
	scenario1b.ctxDone()
	select {
	case resp := <-scenario2a.req.successCh:
		require.Equal(t, resp.llama, scenario2a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario2a.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	loadedMax = 1
	s.newServerFn = scenario3a.newServer
	slog.Info("scenario3a")
	s.pendingReqCh <- scenario3a.req
	// finish prior request, so new model can load
	time.Sleep(1 * time.Millisecond)
	scenario2a.ctxDone()
	select {
	case resp := <-scenario3a.req.successCh:
		require.Equal(t, resp.llama, scenario3a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3a.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	loadedMax = 0
	s.newServerFn = scenario3b.newServer
	slog.Info("scenario3b")
	s.pendingReqCh <- scenario3b.req
	select {
	case resp := <-scenario3b.req.successCh:
		require.Equal(t, resp.llama, scenario3b.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3b.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()

	// This is a CPU load with NumGPU = 0 so it should load
	s.newServerFn = scenario3c.newServer
	slog.Info("scenario3c")
	s.pendingReqCh <- scenario3c.req
	select {
	case resp := <-scenario3c.req.successCh:
		require.Equal(t, resp.llama, scenario3c.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3c.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that won't fit
	s.newServerFn = scenario3d.newServer
	slog.Info("scenario3d")
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
	scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
	time.Sleep(2 * time.Millisecond)
	s.pendingReqCh <- scenario3d.req
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
	scenario3b.ctxDone()
	select {
	case resp := <-scenario3d.req.successCh:
		require.Equal(t, resp.llama, scenario3d.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3d.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
}

func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
	scenario1a.req.sessionDuration = 0
	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
	scenario1b.req.sessionDuration = 0
	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
	scenario1c.req.sessionDuration = 0
	maxQueuedRequests = 1
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	slog.Info("scenario1a")
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("scenario1b")
	successCh1b, errCh1b := s.GetRunner(scenario1b.ctx, scenario1b.req.model, scenario1b.req.opts, scenario1b.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	require.Len(t, successCh1b, 0)
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, errCh1a, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	scenario1a.ctxDone()
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	scenario1c.req.model.ModelPath = "bad path"
	slog.Info("scenario1c")
	successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 0)
	require.Len(t, successCh1c, 0)
	require.Len(t, errCh1c, 0)
	time.Sleep(5 * time.Millisecond)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 0)
	s.loadedMu.Unlock()
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
	scenario1b.ctxDone()
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, errCh1a, 0)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	time.Sleep(scenario1a.req.sessionDuration)
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
	require.Len(t, s.finishedReqCh, 0)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 0)
	s.loadedMu.Unlock()

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	req := &LlmRequest{
		ctx:             ctx,
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		sessionDuration: 2,
	}
	finished := make(chan *LlmRequest)
	llm1 := &mockLlm{}
	r1 := &runnerRef{llama: llm1, sessionDuration: 1}
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
	llm1 := &mockLlm{estimatedVRAM: 100}
	llm2 := &mockLlm{estimatedVRAM: 200}
	r1 := &runnerRef{llama: llm1, gpus: gpus}
	r2 := &runnerRef{llama: llm2, gpus: gpus}
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()
	s.updateFreeSpace(gpus)
	require.Equal(t, uint64(850), gpus[0].FreeMemory)
	require.Equal(t, uint64(1850), gpus[1].FreeMemory)
}

func TestFindRunnerToUnload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	req := &LlmRequest{
		ctx:  ctx,
		opts: api.DefaultOptions(),
	}
	r1 := &runnerRef{refCount: 1, sessionDuration: 1}
	r2 := &runnerRef{sessionDuration: 2}
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()
	resp := s.findRunnerToUnload(req)
	require.Equal(t, r2, resp)
	r2.refCount = 1
	resp = s.findRunnerToUnload(req)
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	llm := &mockLlm{}
	do := api.DefaultOptions()
	runner := &runnerRef{
		adapters:   []string{"adapter1"},
		projectors: []string{"projector1"},
		Options:    &do,
		llama:      llm,
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
		opts: api.DefaultOptions(),
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.AdapterPaths = runner.adapters
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.ProjectorPaths = runner.projectors
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = fmt.Errorf("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	llm1 := &mockLlm{}
	llm2 := &mockLlm{}
	s := InitScheduler(ctx)
	s.unloadAllRunners()
	r1 := &runnerRef{llama: llm1}
	r2 := &runnerRef{llama: llm2}
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()
	s.unloadAllRunners()
	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
	llm1 := &mockLlm{}
	r1 := &runnerRef{llama: llm1}
	r2 := &runnerRef{adapters: []string{"A"}}
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
	require.Nil(t, r2.adapters)
}

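// mockLlm is a stand-in llama server whose methods return the canned values
// stored in its fields and which records whether Close was called.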
type mockLlm struct {
	pingResp error
	waitResp error
	completionResp error
	embeddingResp []float64
	embeddingRespErr error
	tokenizeResp []int
	tokenizeRespErr error
	detokenizeResp string
	detokenizeRespErr error
	closeResp error
	closeCalled bool
	estimatedVRAM uint64
}

func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}
func (s *mockLlm) Embedding(ctx context.Context, prompt string) ([]float64, error) {
	return s.embeddingResp, s.embeddingRespErr
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detokenizeRespErr
}
func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }