package server

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"github.com/stretchr/testify/require"
)

func init() {
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	require.NotNil(t, s.loaded)
	s.loadedMu.Unlock()
}

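// TestLoad drives Scheduler.load directly: a failing newServerFn surfaces an
// error, a successful load registers the runner with a reference count of 1,
// and a WaitUntilRunning failure leaves an idle runner queued for expiration.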
func TestLoad(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	var ggml *llm.GGML // value not used in tests
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
	}

	// Fail to load model first
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return nil, fmt.Errorf("something failed to load model blah")
	}
	gpus := gpu.GpuInfoList{}
	s.load(req, ggml, gpus, 0)
	require.Empty(t, req.successCh)
	require.Len(t, req.errCh, 1)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, ggml, gpus, 0)
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = fmt.Errorf("wait failure")
	s.load(req, ggml, gpus, 0)
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
		t.Fatalf("unexpected success %v", resp)
	}
	s.loadedMu.Lock()
	runner := s.loaded["dummy_model_path"]
	s.loadedMu.Unlock()
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
	time.Sleep(1 * time.Millisecond)
	require.Len(t, s.expiredCh, 1)
}

type reqBundle struct {
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
	ggml    *llm.GGML
}

func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
	return scenario.srv, nil
}

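// newScenarioRequest encodes a minimal GGUF model into a temp file, loads it
// back, and returns a bundle with the request, its cancel func, and a mock
// server reporting the given estimated VRAM.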
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()

	gguf := llm.NewGGUFV3(binary.LittleEndian)
	err = gguf.Encode(f, llm.KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []llm.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	})
	require.NoError(t, err)

	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
	require.NoError(t, err)

	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
		model:           model,
		opts:            api.DefaultOptions(),
		sessionDuration: duration,
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
}

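// getGpuFn stands in for the scheduler's GPU discovery hook, reporting a
// single "metal" device with 24 GB total and 12 GB free memory.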
func getGpuFn() gpu.GpuInfoList {
	g := gpu.GpuInfo{Library: "metal"}
	g.TotalMemory = 24 * format.GigaByte
	g.FreeMemory = 12 * format.GigaByte
	return []gpu.GpuInfo{g}
}

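// getCpuFn stands in for the scheduler's CPU discovery hook, reporting 32 GB
// of system memory with 26 GB free.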
func getCpuFn() gpu.GpuInfoList {
	g := gpu.GpuInfo{Library: "cpu"}
	g.TotalMemory = 32 * format.GigaByte
	g.FreeMemory = 26 * format.GigaByte
	return []gpu.GpuInfo{g}
}

func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	// The second request should reuse the same runner as the first since no reload is needed
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	// Trigger a reload by changing the adapter paths
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
	// Let the first request finish so the model can reload
	time.Sleep(1 * time.Millisecond)
	a.ctxDone()
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

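// TestRequestsMultipleLoadedModels loads GPU- and CPU-bound models side by
// side and verifies that a request which does not fit has to wait until
// enough of the prior runners have been unloaded.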
func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior runners unloaded

	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()

	// This is a CPU load with NumGPU = 0, so it should load
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
	select {
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that won't fit
	s.newServerFn = d.newServer
	slog.Info("d")
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
	a.ctxDone() // Won't help since this one isn't big enough to make room
	time.Sleep(2 * time.Millisecond)
	s.pendingReqCh <- d.req
	// Finish the prior request so the new model can load
	time.Sleep(6 * time.Millisecond)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
	b.ctxDone()
	select {
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, d.req.errCh)
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
}

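// TestGetRunner exercises the public GetRunner entry point: a second request
// is rejected with "server busy" when OLLAMA_MAX_QUEUE is exceeded, the first
// request loads normally, and a bad model path surfaces an error.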
func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)

	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	require.Empty(t, successCh1b)
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")

	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
	case err := <-errCh1a:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	a.ctxDone() // Set the "a" model to idle so it can unload
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
	// Starts in the pending channel, then should be quickly processed and return an error
	time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload
	require.Empty(t, successCh1c)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
	b.ctxDone()
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
	case err := <-errCh1a:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	time.Sleep(scenario1a.req.sessionDuration.Duration)
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
	require.Empty(t, s.finishedReqCh)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()

	// Also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

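// TestUseLoadedRunner checks that reusing an already loaded runner bumps its
// reference count, adopts the request's session duration, and signals the
// finished channel once the request context is done.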
func TestUseLoadedRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	req := &LlmRequest{
		ctx:             ctx,
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		sessionDuration: &api.Duration{Duration: 2},
	}
	finished := make(chan *LlmRequest)
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
	case err := <-req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	s.updateFreeSpace(gpus)
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
}

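// TestFilterGPUsWithoutLoadingModels verifies that GPUs in use by a runner
// that is still loading are excluded from the candidate list.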
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{gpus[1]}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
}

func TestFindRunnerToUnload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	resp := s.findRunnerToUnload()
	require.Equal(t, r2, resp)
	r2.refCount = 1
	resp = s.findRunnerToUnload()
	require.Equal(t, r1, resp)
}

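// TestNeedsReload walks through the conditions that force a reload: differing
// adapters or projectors, a runner still loading with changed options, and a
// failed ping, then confirms a matching request does not trigger one.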
func TestNeedsReload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	do := api.DefaultOptions()
	runner := &runnerRef{
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
		opts: api.DefaultOptions(),
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.AdapterPaths = runner.model.AdapterPaths
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.ProjectorPaths = runner.model.ProjectorPaths
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = fmt.Errorf("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	s := InitScheduler(ctx)
	s.unloadAllRunners()

	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}

	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
	require.Nil(t, r2.model)
}

func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

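// mockLlm is a minimal stand-in for llm.LlamaServer; each field holds the
// canned response (or error) returned by the corresponding method.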
type mockLlm struct {
	pingResp           error
	waitResp           error
	completionResp     error
	embedResp          [][]float32
	embedRespErr       error
	tokenizeResp       []int
	tokenizeRespErr    error
	detokenizeResp     string
	detokenizeRespErr  error
	closeResp          error
	closeCalled        bool
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }

func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}

func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
	return s.embedResp, s.embedRespErr
}

func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}

func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detokenizeRespErr
}

func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}

func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }