sched_test.go

package server

import (
	"bytes"
	"context"
	"errors"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/stretchr/testify/require"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/llm"
)
func TestMain(m *testing.M) {
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
	os.Exit(m.Run())
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	require.NotNil(t, s.loaded)
	s.loadedMu.Unlock()
}
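// TestLoad drives the scheduler's load path directly: the first attempt uses a
// newServerFn that fails, the second succeeds, and the third fails while waiting
// for the server, which should leave an idle runner on the expired channel.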
func TestLoad(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	var f *ggml.GGML // value not used in tests
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
	}
	// Fail to load model first
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return nil, errors.New("something failed to load model blah")
	}
	gpus := discover.GpuInfoList{}
	s.load(req, f, gpus, 0)
	require.Empty(t, req.successCh)
	require.Len(t, req.errCh, 1)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, f, gpus, 0)
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = errors.New("wait failure")
	s.load(req, f, gpus, 0)
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
		t.Fatalf("unexpected success %v", resp)
	}
	s.loadedMu.Lock()
	runner := s.loaded["dummy_model_path"]
	s.loadedMu.Unlock()
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
	time.Sleep(1 * time.Millisecond)
	require.Len(t, s.expiredCh, 1)
}
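// reqBundle bundles everything needed to simulate one scheduler request: a
// cancellable context, a mock llama server, the request itself, and the parsed
// GGUF for the temporary model file.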
type reqBundle struct {
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
	f       *ggml.GGML
}

func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
	return scenario.srv, nil
}
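// newScenarioRequest writes a minimal GGUF model to a temp directory, loads it,
// and returns a reqBundle whose mock server reports the requested VRAM estimate.
// A nil duration defaults to 5ms so runners expire quickly in these tests.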
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
	require.NoError(t, err)
	defer f.Close()

	require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []ggml.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
	}))
	require.NoError(t, err)

	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
	b.f, err = llm.LoadModel(model.ModelPath, 0)
	require.NoError(t, err)

	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
		model:           model,
		opts:            api.DefaultOptions(),
		sessionDuration: duration,
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
}
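// getGpuFn and getCpuFn stand in for the scheduler's device discovery hooks,
// reporting fixed memory totals so memory-fit decisions are deterministic.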
func getGpuFn() discover.GpuInfoList {
	g := discover.GpuInfo{Library: "metal"}
	g.TotalMemory = 24 * format.GigaByte
	g.FreeMemory = 12 * format.GigaByte
	return []discover.GpuInfo{g}
}

func getCpuFn() discover.GpuInfoList {
	g := discover.GpuInfo{Library: "cpu"}
	g.TotalMemory = 32 * format.GigaByte
	g.FreeMemory = 26 * format.GigaByte
	return []discover.GpuInfo{g}
}
func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
	b.f = a.f

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	// Same runner as first request due to not needing a reload
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}
func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
	b.f = a.f

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}

	// Trigger a reload
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
	// Finish the first request so the model can reload
	time.Sleep(1 * time.Millisecond)
	a.ctxDone()
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}
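// TestRequestsMultipleLoadedModels exercises concurrent loads: two GPU models
// plus a CPU-only model (NumGPU=0) are held loaded at once, while a fourth model
// only loads after prior runners are released and unload to make room.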
func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior models unloaded to make room

	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
	select {
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()

	// This is a CPU load with NumGPU = 0 so it should load
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
	select {
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that won't fit
	s.newServerFn = d.newServer
	slog.Info("d")
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
	a.ctxDone() // Won't help since this one isn't big enough to make room
	time.Sleep(2 * time.Millisecond)
	s.pendingReqCh <- d.req
	// Finish the prior request, so the new model can load
	time.Sleep(6 * time.Millisecond)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
	b.ctxDone()
	select {
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, d.req.errCh)
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
}
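// TestGetRunner covers the public GetRunner entry point: queueing a request,
// rejecting a second one while OLLAMA_MAX_QUEUE is full, and surfacing a load
// error for a bad model path.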
func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer done()
	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer

	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)

	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	require.Empty(t, successCh1b)
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")

	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
	case err := <-errCh1a:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	a.ctxDone() // Set "a" model to idle so it can unload
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 1)
	s.loadedMu.Unlock()

	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
	// Starts in pending channel, then should be quickly processed to return an error
	time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload
	require.Empty(t, successCh1c)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
	b.ctxDone()
}
func TestExpireRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
	}

	var f *ggml.GGML
	gpus := discover.GpuInfoList{}
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, f, gpus, 0)
	select {
	case err := <-req.errCh:
		if err != nil {
			t.Fatalf("expected no errors when loading, got '%s'", err.Error())
		}
	case resp := <-req.successCh:
		s.loadedMu.Lock()
		if resp.refCount != uint(1) || len(s.loaded) != 1 {
			t.Fatalf("expected a model to be loaded")
		}
		s.loadedMu.Unlock()
	}

	s.expireRunner(&Model{ModelPath: "foo"})

	s.finishedReqCh <- req
	s.processCompleted(ctx)

	s.loadedMu.Lock()
	if len(s.loaded) != 0 {
		t.Fatalf("expected model to be unloaded")
	}
	s.loadedMu.Unlock()
}
// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
	s := InitScheduler(ctx)
	s.getGpuFn = func() discover.GpuInfoList {
		g := discover.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []discover.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
		s.loadedMu.Lock()
		require.Len(t, s.loaded, 1)
		s.loadedMu.Unlock()
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
	case err := <-errCh1a:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	time.Sleep(scenario1a.req.sessionDuration.Duration)
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
	require.Empty(t, s.finishedReqCh)
	s.loadedMu.Lock()
	require.Empty(t, s.loaded)
	s.loadedMu.Unlock()

	// This also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}
func TestUseLoadedRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	req := &LlmRequest{
		ctx:             ctx,
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		sessionDuration: &api.Duration{Duration: 2},
	}
	finished := make(chan *LlmRequest)
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
	case err := <-req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}
func TestUpdateFreeSpace(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := discover.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	s.updateFreeSpace(gpus)
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
}
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := discover.GpuInfoList{
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
	r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

	r1.gpus = discover.GpuInfoList{gpus[1]}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

	r1.gpus = discover.GpuInfoList{}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
}
func TestFindRunnerToUnload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()

	resp := s.findRunnerToUnload()
	require.Equal(t, r2, resp)
	r2.refCount = 1
	resp = s.findRunnerToUnload()
	require.Equal(t, r1, resp)
}
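// TestNeedsReload walks through the conditions that force a runner reload:
// mismatched adapters or projectors, a runner still loading with changed
// options, a failed ping, and a different NumGPU request.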
func TestNeedsReload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	do := api.DefaultOptions()
	runner := &runnerRef{
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
		opts: api.DefaultOptions(),
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.AdapterPaths = runner.model.AdapterPaths
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.ProjectorPaths = runner.model.ProjectorPaths
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = errors.New("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
}
func TestUnloadAllRunners(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	s := InitScheduler(ctx)
	s.unloadAllRunners()

	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}

	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.loadedMu.Unlock()
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
	require.Nil(t, r2.model)
}
func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}
func TestHomogeneousGPUs(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)

	s.getGpuFn = func() discover.GpuInfoList {
		// Set memory values to require the model to be spread
		gpus := []discover.GpuInfo{
			{Library: "cuda"},
			{Library: "rocm"},
		}
		gpus[0].TotalMemory = 1 * format.GibiByte
		gpus[0].FreeMemory = 256 * format.MebiByte
		gpus[1].TotalMemory = 1 * format.GibiByte
		gpus[1].FreeMemory = 256 * format.MebiByte
		return gpus
	}
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		require.Len(t, gpus, 1)
		return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
	}
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}
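// mockLlm is a canned stand-in for llm.LlamaServer used by the tests above;
// each method returns the corresponding stubbed field, and Close records that
// it was called.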
type mockLlm struct {
	pingResp           error
	waitResp           error
	completionResp     error
	embeddingResp      []float32
	embeddingRespErr   error
	tokenizeResp       []int
	tokenizeRespErr    error
	detokenizeResp     string
	detokenizeRespErr  error
	closeResp          error
	closeCalled        bool
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
}
func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }

func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}

func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
	return s.embeddingResp, s.embeddingRespErr
}

func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}

func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detokenizeRespErr
}

func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}

func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }