sched_test.go

package server

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"github.com/ollama/ollama/server/envconfig"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func init() {
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	require.NotNil(t, s.loaded)
	s.loadedMu.Unlock()
}

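// TestLoad exercises Scheduler.load directly: a failing newServerFn surfaces an
// error on errCh, a successful load registers the runner in s.loaded, and a
// WaitUntilRunning failure leaves an unused runner that gets queued for expiry.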
func TestLoad(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	var ggml *llm.GGML // value not used in tests
	req := &LlmRequest{
		ctx: ctx,
		model: &Model{ModelPath: "foo"},
		opts: api.DefaultOptions(),
		successCh: make(chan *runnerRef, 1),
		errCh: make(chan error, 1),
		sessionDuration: 2,
	}

	// Fail to load model first
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
		return nil, fmt.Errorf("something failed to load model blah")
	}
	gpus := gpu.GpuInfoList{}
	s.load(req, ggml, gpus)
	require.Len(t, req.successCh, 0)
	require.Len(t, req.errCh, 1)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 0)
	s.loadedMu.Unlock()
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

	server := &mockLlm{estimatedVRAM: 10}
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, ggml, gpus)
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = fmt.Errorf("wait failure")
	s.load(req, ggml, gpus)
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
		t.Errorf("unexpected success %v", resp)
	}
	s.loadedMu.Lock()
	runner := s.loaded["dummy_model_path"]
	s.loadedMu.Unlock()
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
	time.Sleep(1 * time.Millisecond)
	require.Len(t, s.expiredCh, 1)
}

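// bundle wires together everything a single scheduler scenario needs: a
// cancellable context, a mock llama server, the request, and the GGML metadata
// of a tiny temporary GGUF model created by newScenario.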
type bundle struct {
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
	ggml    *llm.GGML
}

func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
	return scenario.srv, nil
}

func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64) *bundle {
	scenario := &bundle{}
	scenario.ctx, scenario.ctxDone = context.WithCancel(ctx)
	t.Helper()
	f, err := os.CreateTemp(t.TempDir(), modelName)
	assert.Nil(t, err)
	defer f.Close()
	gguf := llm.NewGGUFV3(binary.LittleEndian)
	err = gguf.Encode(f, llm.KV{
		"general.architecture": "llama",
		"general.name": "name",
		"llama.context_length": uint32(32),
		"llama.embedding_length": uint32(4096),
		"llama.block_count": uint32(1),
		"llama.attention.head_count": uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens": []string{" "},
		"tokenizer.ggml.scores": []float32{0},
		"tokenizer.ggml.token_type": []int32{0},
	}, []llm.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
	})
	assert.Nil(t, err)
	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
	scenario.ggml, err = llm.LoadModel(model.ModelPath)
	require.NoError(t, err)
	scenario.req = &LlmRequest{
		ctx: scenario.ctx,
		model: model,
		opts: api.DefaultOptions(),
		sessionDuration: 5 * time.Millisecond,
		successCh: make(chan *runnerRef, 1),
		errCh: make(chan error, 1),
	}
	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
	return scenario
}

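// TestRequests walks the scheduler through its main paths: reusing a loaded
// runner for the same model, reloading when adapters change, enforcing
// MaxRunners, loading multiple models, allowing a CPU-only load, and unloading
// idle runners to make room for a model that doesn't fit.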
func TestRequests(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
	scenario1a.req.sessionDuration = 0
	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
	scenario1b.req.model = scenario1a.req.model
	scenario1b.ggml = scenario1a.ggml
	scenario1b.req.sessionDuration = 0

	// Simple reload of the same model
	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
	tmpModel := *scenario1a.req.model
	scenario2a.req.model = &tmpModel
	scenario2a.ggml = scenario1a.ggml

	// Multiple loaded models
	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
	scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
	scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
	scenario3c.req.opts.NumGPU = 0 // CPU load, will be allowed
	scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs a prior model unloaded to fit

	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}

	s.newServerFn = scenario1a.newServer
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-scenario1a.req.successCh:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario1a.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	// Same runner as the first request since no reload is needed
	s.newServerFn = scenario1b.newServer
	slog.Info("scenario1b")
	s.pendingReqCh <- scenario1b.req
	select {
	case resp := <-scenario1b.req.successCh:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario1b.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	// Trigger a reload
	s.newServerFn = scenario2a.newServer
	scenario2a.req.model.AdapterPaths = []string{"new"}
	slog.Info("scenario2a")
	s.pendingReqCh <- scenario2a.req
	// Finish the first two requests so the model can reload
	time.Sleep(1 * time.Millisecond)
	scenario1a.ctxDone()
	scenario1b.ctxDone()
	select {
	case resp := <-scenario2a.req.successCh:
		require.Equal(t, resp.llama, scenario2a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario2a.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	envconfig.MaxRunners = 1
	s.newServerFn = scenario3a.newServer
	slog.Info("scenario3a")
	s.pendingReqCh <- scenario3a.req
	// Finish the prior request so the new model can load
	time.Sleep(1 * time.Millisecond)
	scenario2a.ctxDone()
	select {
	case resp := <-scenario3a.req.successCh:
		require.Equal(t, resp.llama, scenario3a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3a.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	envconfig.MaxRunners = 0
	s.newServerFn = scenario3b.newServer
	slog.Info("scenario3b")
	s.pendingReqCh <- scenario3b.req
	select {
	case resp := <-scenario3b.req.successCh:
		require.Equal(t, resp.llama, scenario3b.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3b.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()

	// This is a CPU load with NumGPU = 0, so it should load
	s.newServerFn = scenario3c.newServer
	slog.Info("scenario3c")
	s.pendingReqCh <- scenario3c.req
	select {
	case resp := <-scenario3c.req.successCh:
		require.Equal(t, resp.llama, scenario3c.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3c.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that won't fit
	s.newServerFn = scenario3d.newServer
	slog.Info("scenario3d")
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
	scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
	time.Sleep(2 * time.Millisecond)
	s.pendingReqCh <- scenario3d.req
	// Finish the prior request so the new model can load
	time.Sleep(6 * time.Millisecond)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
	scenario3b.ctxDone()
	select {
	case resp := <-scenario3d.req.successCh:
		require.Equal(t, resp.llama, scenario3d.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3d.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
}

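// TestGetRunner covers the public GetRunner entry point: queueing a request,
// rejecting a second request with "server busy" once MaxQueuedRequests is hit,
// and returning an error for a model path that cannot be loaded.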
func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
	scenario1a.req.sessionDuration = 0
	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
	scenario1b.req.sessionDuration = 0
	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
	scenario1c.req.sessionDuration = 0
	envconfig.MaxQueuedRequests = 1
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	slog.Info("scenario1a")
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("scenario1b")
	successCh1b, errCh1b := s.GetRunner(scenario1b.ctx, scenario1b.req.model, scenario1b.req.opts, scenario1b.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	require.Len(t, successCh1b, 0)
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, errCh1a, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	scenario1a.ctxDone()
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	scenario1c.req.model.ModelPath = "bad path"
	slog.Info("scenario1c")
	successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
	// Starts in the pending channel, then should quickly be processed and return an error
	time.Sleep(5 * time.Millisecond)
	require.Len(t, successCh1c, 0)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 0)
	s.loadedMu.Unlock()
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
	scenario1b.ctxDone()
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
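// TestPrematureExpired loads a runner, then injects an expired event while the
// runner still has a positive reference count, confirming the scheduler handles
// it safely and eventually drains the finished and expired channels.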
func TestPrematureExpired(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, errCh1a, 0)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	time.Sleep(scenario1a.req.sessionDuration)
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
	require.Len(t, s.finishedReqCh, 0)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 0)
	s.loadedMu.Unlock()

	// Also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

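// TestUseLoadedRunner checks that reusing an already loaded runner bumps its
// reference count, adopts the request's session duration, answers on successCh,
// and reports the request on the finished channel once its context is done.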
func TestUseLoadedRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	req := &LlmRequest{
		ctx: ctx,
		opts: api.DefaultOptions(),
		successCh: make(chan *runnerRef, 1),
		sessionDuration: 2,
	}
	finished := make(chan *LlmRequest)
	llm1 := &mockLlm{}
	r1 := &runnerRef{llama: llm1, sessionDuration: 1}
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

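// TestUpdateFreeSpace verifies that the VRAM estimates of loaded runners are
// deducted from the free memory reported for the GPUs they occupy.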
func TestUpdateFreeSpace(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID: "1",
		},
		{
			Library: "a",
			ID: "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
	llm1 := &mockLlm{estimatedVRAM: 100}
	llm2 := &mockLlm{estimatedVRAM: 200}
	r1 := &runnerRef{llama: llm1, gpus: gpus}
	r2 := &runnerRef{llama: llm2, gpus: gpus}
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()
	s.updateFreeSpace(gpus)
	require.Equal(t, uint64(850), gpus[0].FreeMemory)
	require.Equal(t, uint64(1850), gpus[1].FreeMemory)
}

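// TestFindRunnerToUnload verifies the eviction choice: a runner with no active
// references is preferred, and when every runner is in use one is still returned.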
func TestFindRunnerToUnload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	r1 := &runnerRef{refCount: 1, sessionDuration: 1}
	r2 := &runnerRef{sessionDuration: 2}
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()
	resp := s.findRunnerToUnload()
	require.Equal(t, r2, resp)
	r2.refCount = 1
	resp = s.findRunnerToUnload()
	require.Equal(t, r1, resp)
}

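// TestNeedsReload steps through the conditions that force a reload: differing
// adapter or projector paths, changed options while the runner is loading, a
// failed Ping, and a changed NumGPU request, while matching settings (or
// NumGPU -1) do not.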
func TestNeedsReload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	llm := &mockLlm{}
	do := api.DefaultOptions()
	runner := &runnerRef{
		model: &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
		Options: &do,
		llama: llm,
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths: []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
		opts: api.DefaultOptions(),
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.AdapterPaths = runner.model.AdapterPaths
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.ProjectorPaths = runner.model.ProjectorPaths
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = fmt.Errorf("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
}

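// TestUnloadAllRunners confirms unloadAllRunners is safe on an empty scheduler
// and closes every loaded runner's server when runners are present.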
func TestUnloadAllRunners(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	llm1 := &mockLlm{}
	llm2 := &mockLlm{}
	s := InitScheduler(ctx)
	s.unloadAllRunners()
	r1 := &runnerRef{llama: llm1}
	r2 := &runnerRef{llama: llm2}
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()
	s.unloadAllRunners()
	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

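// TestUnload confirms that unloading a runner closes its llama server and drops
// its model reference.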
func TestUnload(t *testing.T) {
	llm1 := &mockLlm{}
	r1 := &runnerRef{llama: llm1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
	require.Nil(t, r2.model)
}

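// mockLlm is a minimal stand-in for llm.LlamaServer used by these tests; each
// method returns the canned value configured on the corresponding field, and
// Close records that it was called.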
type mockLlm struct {
	pingResp          error
	waitResp          error
	completionResp    error
	embeddingResp     []float64
	embeddingRespErr  error
	tokenizeResp      []int
	tokenizeRespErr   error
	detokenizeResp    string
	detokenizeRespErr error
	closeResp         error
	closeCalled       bool
	estimatedVRAM     uint64
	estimatedTotal    uint64
}

func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}
func (s *mockLlm) Embedding(ctx context.Context, prompt string) ([]float64, error) {
	return s.embeddingResp, s.embeddingRespErr
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detokenizeRespErr
}
func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }