sched_test.go

package server

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"github.com/stretchr/testify/require"
)

func init() {
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	require.NotNil(t, s.loaded)
	s.loadedMu.Unlock()
}

func TestLoad(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	var ggml *llm.GGML // value not used in tests
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
	}
	// Fail to load model first
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return nil, fmt.Errorf("something failed to load model blah")
	}
	gpus := gpu.GpuInfoList{}
	s.load(req, ggml, gpus, 0)
	require.Empty(t, req.successCh)
	require.Len(t, req.errCh, 1)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, ggml, gpus, 0)
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = fmt.Errorf("wait failure")
	s.load(req, ggml, gpus, 0)
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
		t.Fatalf("unexpected success %v", resp)
	}
	s.loadedMu.Lock()
	runner := s.loaded["dummy_model_path"]
	s.loadedMu.Unlock()
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
	time.Sleep(1 * time.Millisecond)
	require.Len(t, s.expiredCh, 1)
}

type bundle struct {
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
	ggml    *llm.GGML
}

func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
	return scenario.srv, nil
}

// newScenario writes a minimal GGUF model to a temp file and returns a bundle
// with a mock server and a ready-to-use request for that model.
func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64) *bundle {
	scenario := &bundle{}
	scenario.ctx, scenario.ctxDone = context.WithCancel(ctx)
	t.Helper()
	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()
	gguf := llm.NewGGUFV3(binary.LittleEndian)
	err = gguf.Encode(f, llm.KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []llm.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	})
	require.NoError(t, err)
	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
	scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
	require.NoError(t, err)
	scenario.req = &LlmRequest{
		ctx:             scenario.ctx,
		model:           model,
		opts:            api.DefaultOptions(),
		sessionDuration: &api.Duration{Duration: 5 * time.Millisecond},
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return scenario
}

func TestRequests(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 10*time.Second)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
	scenario1a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
	scenario1b.req.model = scenario1a.req.model
	scenario1b.ggml = scenario1a.ggml
	scenario1b.req.sessionDuration = &api.Duration{Duration: 0}

	// simple reload of same model
	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
	tmpModel := *scenario1a.req.model
	scenario2a.req.model = &tmpModel
	scenario2a.ggml = scenario1a.ggml
	scenario2a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}

	// Multiple loaded models
	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
	scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
	scenario3c := newScenario(t, ctx, "ollama-model-4a", 30)
	scenario3c.req.opts.NumGPU = 0 // CPU load, will be allowed
	scenario3d := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior unloaded

	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.getCpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "cpu"}
		g.TotalMemory = 32 * format.GigaByte
		g.FreeMemory = 26 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-scenario1a.req.successCh:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario1a.req.errCh)
	case err := <-scenario1a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	// Same runner as first request due to not needing a reload
	s.newServerFn = scenario1b.newServer
	slog.Info("scenario1b")
	s.pendingReqCh <- scenario1b.req
	select {
	case resp := <-scenario1b.req.successCh:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario1b.req.errCh)
	case err := <-scenario1b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	// Trigger a reload
	s.newServerFn = scenario2a.newServer
	scenario2a.req.model.AdapterPaths = []string{"new"}
	slog.Info("scenario2a")
	s.pendingReqCh <- scenario2a.req
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
	scenario1a.ctxDone()
	scenario1b.ctxDone()
	select {
	case resp := <-scenario2a.req.successCh:
		require.Equal(t, resp.llama, scenario2a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario2a.req.errCh)
	case err := <-scenario2a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	envconfig.MaxRunners = 1
	s.newServerFn = scenario3a.newServer
	slog.Info("scenario3a")
	s.pendingReqCh <- scenario3a.req
	// finish prior request, so new model can load
	time.Sleep(1 * time.Millisecond)
	scenario2a.ctxDone()
	select {
	case resp := <-scenario3a.req.successCh:
		require.Equal(t, resp.llama, scenario3a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario3a.req.errCh)
	case err := <-scenario3a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	envconfig.MaxRunners = 0
	s.newServerFn = scenario3b.newServer
	slog.Info("scenario3b")
	s.pendingReqCh <- scenario3b.req
	select {
	case resp := <-scenario3b.req.successCh:
		require.Equal(t, resp.llama, scenario3b.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario3b.req.errCh)
	case err := <-scenario3b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()

	// This is a CPU load with NumGPU = 0 so it should load
	s.newServerFn = scenario3c.newServer
	slog.Info("scenario3c")
	s.pendingReqCh <- scenario3c.req
	select {
	case resp := <-scenario3c.req.successCh:
		require.Equal(t, resp.llama, scenario3c.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario3c.req.errCh)
	case err := <-scenario3c.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that won't fit
	s.newServerFn = scenario3d.newServer
	slog.Info("scenario3d")
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
	scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
	time.Sleep(2 * time.Millisecond)
	s.pendingReqCh <- scenario3d.req
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
	scenario3b.ctxDone()
	select {
	case resp := <-scenario3d.req.successCh:
		require.Equal(t, resp.llama, scenario3d.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, scenario3d.req.errCh)
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
}

func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
	scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
	scenario1b.req.sessionDuration = &api.Duration{Duration: 0}
	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
	scenario1c.req.sessionDuration = &api.Duration{Duration: 0}
	envconfig.MaxQueuedRequests = 1
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	slog.Info("scenario1a")
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("scenario1b")
	successCh1b, errCh1b := s.GetRunner(scenario1b.ctx, scenario1b.req.model, scenario1b.req.opts, scenario1b.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	require.Empty(t, successCh1b)
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	scenario1a.ctxDone()
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	scenario1c.req.model.ModelPath = "bad path"
	slog.Info("scenario1c")
	successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
	// Starts in pending channel, then should be quickly processed to return an error
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, successCh1c)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
	scenario1b.ctxDone()
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	time.Sleep(scenario1a.req.sessionDuration.Duration)
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
	require.Empty(t, s.finishedReqCh)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	req := &LlmRequest{
		ctx:             ctx,
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		sessionDuration: &api.Duration{Duration: 2},
	}
	finished := make(chan *LlmRequest)
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	s.updateFreeSpace(gpus)
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
}

func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{gpus[1]}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
}

func TestFindRunnerToUnload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	resp := s.findRunnerToUnload()
	require.Equal(t, r2, resp)
	r2.refCount = 1
	resp = s.findRunnerToUnload()
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	do := api.DefaultOptions()
	runner := &runnerRef{
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
		opts: api.DefaultOptions(),
	}

	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.AdapterPaths = runner.model.AdapterPaths
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.ProjectorPaths = runner.model.ProjectorPaths
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = fmt.Errorf("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	s := InitScheduler(ctx)
	s.unloadAllRunners()

	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}

	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	s.unloadAllRunners()
	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
	require.Nil(t, r2.model)
}

func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
	scenario1a := newScenario(t, dctx, "ollama-model-1", 10)
	scenario1a.req.sessionDuration = &api.Duration{Duration: 0}
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

// mockLlm is a canned llm.LlamaServer implementation used by the tests above.
type mockLlm struct {
	pingResp           error
	waitResp           error
	completionResp     error
	embedResp          [][]float32
	embedRespErr       error
	tokenizeResp       []int
	tokenizeRespErr    error
	detokenizeResp     string
	detokenizeRespErr  error
	closeResp          error
	closeCalled        bool
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}

func (s *mockLlm) Embed(ctx context.Context, input []string, images []llm.ImageData) ([][]float32, error) {
	return s.embedResp, s.embedRespErr
}

func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}

func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detokenizeRespErr
}

func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}

func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }