sched_test.go

package server

import (
	"bytes"
	"context"
	"errors"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/stretchr/testify/require"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/llm"
)

func TestMain(m *testing.M) {
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
	os.Exit(m.Run())
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	require.NotNil(t, s.loaded)
	s.loadedMu.Unlock()
}

func TestLoad(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	var ggml *llm.GGML // value not used in tests
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
	}
	// Fail to load model first
	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return nil, errors.New("something failed to load model blah")
	}
	gpus := discover.GpuInfoList{}
	s.load(req, ggml, gpus, 0)
	require.Empty(t, req.successCh)
	require.Len(t, req.errCh, 1)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, ggml, gpus, 0)
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = errors.New("wait failure")
	s.load(req, ggml, gpus, 0)
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
		t.Fatalf("unexpected success %v", resp)
	}
	s.loadedMu.Lock()
	runner := s.loaded["dummy_model_path"]
	s.loadedMu.Unlock()
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
	time.Sleep(1 * time.Millisecond)
	require.Len(t, s.expiredCh, 1)
}

type reqBundle struct {
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
	ggml    *llm.GGML
}

func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
	return scenario.srv, nil
}

// newScenarioRequest writes a minimal GGUF model to a temp file and returns a
// request/mock-server bundle for exercising the scheduler.
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()

	require.NoError(t, llm.WriteGGUF(f, llm.KV{
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []llm.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	}))
	require.NoError(t, err)

	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
	require.NoError(t, err)

	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
		model:           model,
		opts:            api.DefaultOptions(),
		sessionDuration: duration,
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
}

func getGpuFn() discover.GpuInfoList {
	g := discover.GpuInfo{Library: "metal"}
	g.TotalMemory = 24 * format.GigaByte
	g.FreeMemory = 12 * format.GigaByte
	return []discover.GpuInfo{g}
}

func getCpuFn() discover.GpuInfoList {
	g := discover.GpuInfo{Library: "cpu"}
	g.TotalMemory = 32 * format.GigaByte
	g.FreeMemory = 26 * format.GigaByte
	return []discover.GpuInfo{g}
}

func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	// Same runner as first request due to not needing a reload
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	// Trigger a reload
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
	a.ctxDone()
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0 // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded

	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()

	// This is a CPU load with NumGPU = 0 so it should load
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
	select {
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that won't fit
	s.newServerFn = d.newServer
	slog.Info("d")
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
	a.ctxDone() // Won't help since this one isn't big enough to make room
	time.Sleep(2 * time.Millisecond)
	s.pendingReqCh <- d.req
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
	b.ctxDone()
	select {
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, d.req.errCh)
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
}

func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer done()

	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	require.Empty(t, successCh1b)
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
	case err := <-errCh1a:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	a.ctxDone() // Set "a" model to idle so it can unload
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
	// Starts in pending channel, then should be quickly processed to return an error
	time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload
	require.Empty(t, successCh1c)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
	b.ctxDone()
}

func TestExpireRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
	}

	var ggml *llm.GGML
	gpus := discover.GpuInfoList{}
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, ggml, gpus, 0)

	select {
	case err := <-req.errCh:
		if err != nil {
			t.Fatalf("expected no errors when loading, got '%s'", err.Error())
		}
	case resp := <-req.successCh:
		s.loadedMu.Lock()
		if resp.refCount != uint(1) || len(s.loaded) != 1 {
			t.Fatalf("expected a model to be loaded")
		}
		s.loadedMu.Unlock()
	}

	s.expireRunner(&Model{ModelPath: "foo"})

	s.finishedReqCh <- req
	s.processCompleted(ctx)

	s.loadedMu.Lock()
	if len(s.loaded) != 0 {
		t.Fatalf("expected model to be unloaded")
	}
	s.loadedMu.Unlock()
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
	s := InitScheduler(ctx)
	s.getGpuFn = func() discover.GpuInfoList {
		g := discover.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []discover.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
	case err := <-errCh1a:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	time.Sleep(scenario1a.req.sessionDuration.Duration)
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
	require.Empty(t, s.finishedReqCh)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	req := &LlmRequest{
		ctx:             ctx,
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		sessionDuration: &api.Duration{Duration: 2},
	}
	finished := make(chan *LlmRequest)
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
	case err := <-req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := discover.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	s.updateFreeSpace(gpus)
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
}

func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := discover.GpuInfoList{
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
	r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

	r1.gpus = discover.GpuInfoList{gpus[1]}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

	r1.gpus = discover.GpuInfoList{}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
}

func TestFindRunnerToUnload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	resp := s.findRunnerToUnload()
	require.Equal(t, r2, resp)
	r2.refCount = 1
	resp = s.findRunnerToUnload()
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	do := api.DefaultOptions()
	runner := &runnerRef{
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
		opts: api.DefaultOptions(),
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.AdapterPaths = runner.model.AdapterPaths
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.ProjectorPaths = runner.model.ProjectorPaths
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = errors.New("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	s := InitScheduler(ctx)
	s.unloadAllRunners()

	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}

	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
	require.Nil(t, r2.model)
}

func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

func TestHomogeneousGPUs(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)

	s.getGpuFn = func() discover.GpuInfoList {
		// Set memory values to require the model to be spread
		gpus := []discover.GpuInfo{
			{Library: "cuda"},
			{Library: "rocm"},
		}
		gpus[0].TotalMemory = 1 * format.GibiByte
		gpus[0].FreeMemory = 256 * format.MebiByte
		gpus[1].TotalMemory = 1 * format.GibiByte
		gpus[1].FreeMemory = 256 * format.MebiByte
		return gpus
	}
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		require.Len(t, gpus, 1)
		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
	}
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

// mockLlm is a test double for llm.LlamaServer that returns canned responses.
type mockLlm struct {
	pingResp           error
	waitResp           error
	completionResp     error
	embeddingResp      []float32
	embeddingRespErr   error
	tokenizeResp       []int
	tokenizeRespErr    error
	detokenizeResp     string
	detokenizeRespErr  error
	closeResp          error
	closeCalled        bool
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }

func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}

func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
	return s.embeddingResp, s.embeddingRespErr
}

func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}

func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detokenizeRespErr
}

func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}

func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }