sched_test.go

package server

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"github.com/stretchr/testify/require"
)

func init() {
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
}
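
// TestInitScheduler verifies that InitScheduler returns a scheduler with its
// loaded-runner map initialized.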
func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	require.NotNil(t, s.loaded)
	s.loadedMu.Unlock()
}
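
// TestLoad exercises the load path directly: a failing newServerFn surfaces an
// error to the request, a succeeding one registers a runner, and a failure in
// WaitUntilRunning leaves an unreferenced runner that is sent to expiredCh.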
func TestLoad(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	var ggml *llm.GGML // value not used in tests
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
	}
	// Fail to load model first
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return nil, fmt.Errorf("something failed to load model blah")
	}
	gpus := gpu.GpuInfoList{}
	s.load(req, ggml, gpus, 0)
	require.Empty(t, req.successCh)
	require.Len(t, req.errCh, 1)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, ggml, gpus, 0)
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = fmt.Errorf("wait failure")
	s.load(req, ggml, gpus, 0)
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
		t.Fatalf("unexpected success %v", resp)
	}
	s.loadedMu.Lock()
	runner := s.loaded["dummy_model_path"]
	s.loadedMu.Unlock()
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
	time.Sleep(1 * time.Millisecond)
	require.Len(t, s.expiredCh, 1)
}
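
// reqBundle ties together the pieces newScenarioRequest builds for one
// scheduler scenario: a cancellable context, a temporary GGUF-backed model,
// the resulting LlmRequest, and a mock server to hand back from newServerFn.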
type reqBundle struct {
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
	ggml    *llm.GGML
}

func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
	return scenario.srv, nil
}

func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()

	gguf := llm.NewGGUFV3(binary.LittleEndian)
	err = gguf.Encode(f, llm.KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []llm.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	})
	require.NoError(t, err)

	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
	require.NoError(t, err)

	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
		model:           model,
		opts:            api.DefaultOptions(),
		sessionDuration: duration,
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
}
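
// getGpuFn and getCpuFn are fixed stand-ins for the scheduler's hardware
// discovery hooks, reporting a single metal GPU and a single CPU respectively.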
func getGpuFn() gpu.GpuInfoList {
	g := gpu.GpuInfo{Library: "metal"}
	g.TotalMemory = 24 * format.GigaByte
	g.FreeMemory = 12 * format.GigaByte
	return []gpu.GpuInfo{g}
}

func getCpuFn() gpu.GpuInfoList {
	g := gpu.GpuInfo{Library: "cpu"}
	g.TotalMemory = 32 * format.GigaByte
	g.FreeMemory = 26 * format.GigaByte
	return []gpu.GpuInfo{g}
}
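
// TestRequestsSameModelSameRequest submits two requests for the same model and
// expects the second to be served by the already-loaded runner without a reload.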
func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	// Same runner as first request due to not needing a reload
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}
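
// TestRequestsSimpleReloadSameModel loads a model, then changes its adapter
// paths on a second request so the scheduler must reload the same model path.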
func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	// Trigger a reload
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
	// finish the first request so the model can reload
	time.Sleep(1 * time.Millisecond)
	a.ctxDone()
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}
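
// TestRequestsMultipleLoadedModels walks through loading several models under
// different MaxRunners limits, including a CPU-only load, and checks that a
// model which does not fit waits for prior runners to be evicted.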
func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior models unloaded

	envconfig.MaxRunners = 1
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	envconfig.MaxRunners = 0
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()

	// This is a CPU load with NumGPU = 0 so it should load
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
	select {
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that won't fit
	s.newServerFn = d.newServer
	slog.Info("d")
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
	a.ctxDone() // Won't help since this one isn't big enough to make room
	time.Sleep(2 * time.Millisecond)
	s.pendingReqCh <- d.req
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
	b.ctxDone()
	select {
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, d.req.errCh)
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
}
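
// TestGetRunner covers the public GetRunner entry point: queue-full rejection
// when MaxQueuedRequests is 1, a successful load, and an error for a bad model path.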
func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
	envconfig.MaxQueuedRequests = 1
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer

	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)

	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	require.Empty(t, successCh1b)
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")

	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
	case err := <-errCh1a:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	a.ctxDone() // Set "a" model to idle so it can unload
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
	// Starts in the pending channel, then should be quickly processed to return an error
	time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload
	require.Empty(t, successCh1c)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
	b.ctxDone()
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
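
// TestPrematureExpired loads a model and then injects an expired event while the
// runner is still referenced, checking the scheduler tolerates it and eventually unloads.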
func TestPrematureExpired(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
	case err := <-errCh1a:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	time.Sleep(scenario1a.req.sessionDuration.Duration)
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
	require.Empty(t, s.finishedReqCh)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}
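
// TestUseLoadedRunner checks that reusing a loaded runner bumps its reference
// count, updates its session duration, and reports the request as finished when
// its context is cancelled.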
func TestUseLoadedRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	req := &LlmRequest{
		ctx:             ctx,
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		sessionDuration: &api.Duration{Duration: 2},
	}
	finished := make(chan *LlmRequest)
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
	case err := <-req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}
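
// TestUpdateFreeSpace verifies that per-GPU free memory is reduced by the VRAM
// each loaded runner reports for that GPU.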
func TestUpdateFreeSpace(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	s.updateFreeSpace(gpus)
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
}
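
// TestFilterGPUsWithoutLoadingModels ensures GPUs currently used by a loading
// runner are filtered out of the candidate list.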
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{gpus[1]}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
}
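
// TestFindRunnerToUnload prefers a runner with no outstanding references over
// one that is still in use.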
func TestFindRunnerToUnload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	resp := s.findRunnerToUnload()
	require.Equal(t, r2, resp)
	r2.refCount = 1
	resp = s.findRunnerToUnload()
	require.Equal(t, r1, resp)
}
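
// TestNeedsReload steps through the conditions that force a reload: different
// adapters or projectors, a runner still loading with changed options, a failed
// ping, and an explicit NumGPU change.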
func TestNeedsReload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	do := api.DefaultOptions()
	runner := &runnerRef{
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
		opts: api.DefaultOptions(),
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.AdapterPaths = runner.model.AdapterPaths
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.ProjectorPaths = runner.model.ProjectorPaths
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = fmt.Errorf("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
}
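
// TestUnloadAllRunners confirms unloadAllRunners is a no-op on an empty
// scheduler and closes every loaded runner otherwise.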
func TestUnloadAllRunners(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	s := InitScheduler(ctx)
	s.unloadAllRunners()

	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}

	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	s.unloadAllRunners()
	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}
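
// TestUnload verifies unload closes the underlying server and clears the
// runner's model reference.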
func TestUnload(t *testing.T) {
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
	require.Nil(t, r2.model)
}
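
// TestAlreadyCanceled submits a request whose context is already cancelled and
// expects the scheduler to drop it without reporting success or failure.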
func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}
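
// TestHomogeneousGPUs reports one CUDA and one ROCm GPU with memory that would
// force spreading, and asserts newServerFn still receives a single-GPU list,
// since the scheduler does not mix GPU libraries.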
func TestHomogeneousGPUs(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)

	s.getGpuFn = func() gpu.GpuInfoList {
		// Set memory values to require the model to be spread
		gpus := []gpu.GpuInfo{
			{Library: "cuda"},
			{Library: "rocm"},
		}
		gpus[0].TotalMemory = 1 * format.GibiByte
		gpus[0].FreeMemory = 256 * format.MebiByte
		gpus[1].TotalMemory = 1 * format.GibiByte
		gpus[1].FreeMemory = 256 * format.MebiByte
		return gpus
	}
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		require.Len(t, gpus, 1)
		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
	}
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}
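
// mockLlm is a minimal llm.LlamaServer implementation whose responses and
// errors are preconfigured through its fields, letting tests drive the
// scheduler without spawning a real runner.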
type mockLlm struct {
	pingResp           error
	waitResp           error
	completionResp     error
	embedResp          *llm.EmbedResponse
	embedRespErr       error
	tokenizeResp       []int
	tokenizeRespErr    error
	detokenizeResp     string
	detokenizeRespErr  error
	closeResp          error
	closeCalled        bool
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}
func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
	return s.embedResp, s.embedRespErr
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detokenizeRespErr
}
func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}
func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }