// sched_test.go

package server

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"github.com/stretchr/testify/require"
)

func init() {
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	require.NotNil(t, s.loaded)
	s.loadedMu.Unlock()
}

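// TestLoad drives the scheduler's load path directly: a server that fails to
// start surfaces an error, a successful start registers the runner with a
// reference count of 1, and a WaitUntilRunning failure leaves an unreferenced
// runner that ends up on the expired channel.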
func TestLoad(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	var ggml *llm.GGML // value not used in tests
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: 2,
	}

	// Fail to load model first
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
		return nil, fmt.Errorf("something failed to load model blah")
	}
	gpus := gpu.GpuInfoList{}
	s.load(req, ggml, gpus)
	require.Empty(t, req.successCh)
	require.Len(t, req.errCh, 1)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

	server := &mockLlm{estimatedVRAM: 10}
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, ggml, gpus)
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = fmt.Errorf("wait failure")
	s.load(req, ggml, gpus)
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
		t.Errorf("unexpected success %v", resp)
	}
	s.loadedMu.Lock()
	runner := s.loaded["dummy_model_path"]
	s.loadedMu.Unlock()
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
	time.Sleep(1 * time.Millisecond)
	require.Len(t, s.expiredCh, 1)
}

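// bundle groups the pieces a single scheduler scenario needs: a cancellable
// context, a request backed by a minimal on-disk GGUF model, and a mock server
// to hand back from newServerFn.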
type bundle struct {
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
	ggml    *llm.GGML
}

func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
	return scenario.srv, nil
}

func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64) *bundle {
	scenario := &bundle{}
	scenario.ctx, scenario.ctxDone = context.WithCancel(ctx)
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()

	gguf := llm.NewGGUFV3(binary.LittleEndian)
	err = gguf.Encode(f, llm.KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []llm.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
	})
	require.NoError(t, err)

	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
	scenario.ggml, err = llm.LoadModel(model.ModelPath)
	require.NoError(t, err)

	scenario.req = &LlmRequest{
		ctx:             scenario.ctx,
		model:           model,
		opts:            api.DefaultOptions(),
		sessionDuration: 5 * time.Millisecond,
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
	return scenario
}

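// TestRequests runs the scheduler end to end: reusing a loaded runner for an
// identical request, reloading when adapters change, and juggling multiple
// models against MaxRunners and the fake GPU's free memory.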
func TestRequests(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), time.Second)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
	scenario1a.req.sessionDuration = 0
	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
	scenario1b.req.model = scenario1a.req.model
	scenario1b.ggml = scenario1a.ggml
	scenario1b.req.sessionDuration = 0

	// simple reload of same model
	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
	tmpModel := *scenario1a.req.model
	scenario2a.req.model = &tmpModel
	scenario2a.ggml = scenario1a.ggml

	// Multiple loaded models
	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
	scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
	scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
	scenario3c.req.opts.NumGPU = 0                           // CPU load, will be allowed
	scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior models unloaded to make room

	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}

	s.newServerFn = scenario1a.newServer
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-scenario1a.req.successCh:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario1a.req.errCh)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	// Same runner as first request due to not needing a reload
	s.newServerFn = scenario1b.newServer
	slog.Info("scenario1b")
	s.pendingReqCh <- scenario1b.req
	select {
	case resp := <-scenario1b.req.successCh:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario1b.req.errCh)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	// Trigger a reload
	s.newServerFn = scenario2a.newServer
	scenario2a.req.model.AdapterPaths = []string{"new"}
	slog.Info("scenario2a")
	s.pendingReqCh <- scenario2a.req
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
	scenario1a.ctxDone()
	scenario1b.ctxDone()
	select {
	case resp := <-scenario2a.req.successCh:
		require.Equal(t, resp.llama, scenario2a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario2a.req.errCh)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	envconfig.MaxRunners = 1
	s.newServerFn = scenario3a.newServer
	slog.Info("scenario3a")
	s.pendingReqCh <- scenario3a.req
	// finish prior request, so new model can load
	time.Sleep(1 * time.Millisecond)
	scenario2a.ctxDone()
	select {
	case resp := <-scenario3a.req.successCh:
		require.Equal(t, resp.llama, scenario3a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario3a.req.errCh)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	envconfig.MaxRunners = 0
	s.newServerFn = scenario3b.newServer
	slog.Info("scenario3b")
	s.pendingReqCh <- scenario3b.req
	select {
	case resp := <-scenario3b.req.successCh:
		require.Equal(t, resp.llama, scenario3b.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario3b.req.errCh)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()

	// This is a CPU load with NumGPU = 0 so it should load
	s.newServerFn = scenario3c.newServer
	slog.Info("scenario3c")
	s.pendingReqCh <- scenario3c.req
	select {
	case resp := <-scenario3c.req.successCh:
		require.Equal(t, resp.llama, scenario3c.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario3c.req.errCh)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that won't fit
	s.newServerFn = scenario3d.newServer
	slog.Info("scenario3d")
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
	scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
	time.Sleep(2 * time.Millisecond)
	s.pendingReqCh <- scenario3d.req
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
	scenario3b.ctxDone()
	select {
	case resp := <-scenario3d.req.successCh:
		require.Equal(t, resp.llama, scenario3d.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario3d.req.errCh)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
}

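// TestGetRunner exercises the public GetRunner entry point: rejection with
// "server busy" when MaxQueuedRequests is 1, a successful load, and an error
// for a request whose model path does not exist.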
func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
	scenario1a.req.sessionDuration = 0
	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
	scenario1b.req.sessionDuration = 0
	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
	scenario1c.req.sessionDuration = 0

	envconfig.MaxQueuedRequests = 1
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}

	s.newServerFn = scenario1a.newServer
	slog.Info("scenario1a")
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)

	slog.Info("scenario1b")
	successCh1b, errCh1b := s.GetRunner(scenario1b.ctx, scenario1b.req.model, scenario1b.req.opts, scenario1b.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	require.Empty(t, successCh1b)
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")

	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	scenario1a.ctxDone()
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	scenario1c.req.model.ModelPath = "bad path"
	slog.Info("scenario1c")
	successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
	// Starts in the pending channel, then should quickly be processed and return an error
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, successCh1c)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
	scenario1b.ctxDone()
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
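// TestPrematureExpired injects an expired event for a runner that still has a
// positive reference count and verifies the scheduler settles cleanly,
// including when a stray finished event arrives afterwards.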
func TestPrematureExpired(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	time.Sleep(scenario1a.req.sessionDuration)
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
	require.Empty(t, s.finishedReqCh)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

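// TestUseLoadedRunner checks that reusing an already-loaded runner bumps its
// reference count, adopts the request's session duration, and reports the
// request as finished once its context is cancelled.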
func TestUseLoadedRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	req := &LlmRequest{
		ctx:             ctx,
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		sessionDuration: 2,
	}
	finished := make(chan *LlmRequest)
	llm1 := &mockLlm{}
	r1 := &runnerRef{llama: llm1, sessionDuration: 1}
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

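// TestUpdateFreeSpace verifies that memory claimed by loaded runners is
// deducted from the free memory reported for their GPUs.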
func TestUpdateFreeSpace(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
	llm1 := &mockLlm{estimatedVRAM: 100}
	llm2 := &mockLlm{estimatedVRAM: 200}
	r1 := &runnerRef{llama: llm1, gpus: gpus}
	r2 := &runnerRef{llama: llm2, gpus: gpus}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	s.updateFreeSpace(gpus)
	require.Equal(t, uint64(850), gpus[0].FreeMemory)
	require.Equal(t, uint64(1850), gpus[1].FreeMemory)
}

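// TestFindRunnerToUnload checks that an unreferenced runner is chosen for
// unloading ahead of one that is still in use.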
func TestFindRunnerToUnload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	r1 := &runnerRef{refCount: 1, sessionDuration: 1}
	r2 := &runnerRef{sessionDuration: 2}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	resp := s.findRunnerToUnload()
	require.Equal(t, r2, resp)
	r2.refCount = 1
	resp = s.findRunnerToUnload()
	require.Equal(t, r1, resp)
}

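// TestNeedsReload walks through the conditions that force a reload: different
// adapters or projectors, a still-loading runner with changed options, a
// failed ping, and a change in NumGPU.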
func TestNeedsReload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm := &mockLlm{}
	do := api.DefaultOptions()
	runner := &runnerRef{
		model:   &Model{AdapterPaths: []string{"adapter1"}, ProjectorPaths: []string{"projector1"}},
		Options: &do,
		llama:   llm,
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
		opts: api.DefaultOptions(),
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.AdapterPaths = runner.model.AdapterPaths
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.ProjectorPaths = runner.model.ProjectorPaths
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = fmt.Errorf("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
}

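// TestUnloadAllRunners makes sure unloading is safe with nothing loaded and
// that every loaded runner's server gets closed.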
func TestUnloadAllRunners(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm1 := &mockLlm{}
	llm2 := &mockLlm{}
	s := InitScheduler(ctx)
	s.unloadAllRunners()

	r1 := &runnerRef{llama: llm1}
	r2 := &runnerRef{llama: llm2}
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	s.unloadAllRunners()
	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

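// TestUnload covers both unload paths: closing the underlying server and
// releasing a runner that only holds a model reference.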
func TestUnload(t *testing.T) {
	llm1 := &mockLlm{}
	r1 := &runnerRef{llama: llm1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}}
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
	require.Nil(t, r2.model)
}

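// mockLlm is a minimal llm.LlamaServer stand-in whose responses and errors can
// be set per test.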
type mockLlm struct {
	pingResp          error
	waitResp          error
	completionResp    error
	embeddingResp     []float64
	embeddingRespErr  error
	tokenizeResp      []int
	tokenizeRespErr   error
	detokenizeResp    string
	detokenizeRespErr error
	closeResp         error
	closeCalled       bool
	estimatedVRAM     uint64
	estimatedTotal    uint64
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }

func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}

func (s *mockLlm) Embedding(ctx context.Context, prompt string) ([]float64, error) {
	return s.embeddingResp, s.embeddingRespErr
}

func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}

func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detokenizeRespErr
}

func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}

func (s *mockLlm) EstimatedVRAM() uint64  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal }