sched_test.go

package server

import (
    "bytes"
    "context"
    "encoding/binary"
    "fmt"
    "log/slog"
    "os"
    "runtime"
    "testing"
    "time"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/app/lifecycle"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/gpu"
    "github.com/ollama/ollama/llm"
    "github.com/stretchr/testify/require"
)

func init() {
    os.Setenv("OLLAMA_DEBUG", "1")
    lifecycle.InitLogging()
}
func TestInitScheduler(t *testing.T) {
    ctx, done := context.WithCancel(context.Background())
    defer done()
    s := InitScheduler(ctx)
    s.loadedMu.Lock()
    require.NotNil(t, s.loaded)
    s.loadedMu.Unlock()
}
func TestLoad(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
    defer done()
    s := InitScheduler(ctx)
    var ggml *llm.GGML // value not used in tests
    req := &LlmRequest{
        ctx:             ctx,
        model:           &Model{ModelPath: "foo"},
        opts:            api.DefaultOptions(),
        successCh:       make(chan *runnerRef, 1),
        errCh:           make(chan error, 1),
        sessionDuration: &api.Duration{Duration: 2 * time.Second},
    }
    // Fail to load model first
    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        return nil, fmt.Errorf("something failed to load model blah")
    }
    gpus := gpu.GpuInfoList{}
    s.load(req, ggml, gpus, 0)
    require.Empty(t, req.successCh)
    require.Len(t, req.errCh, 1)
    s.loadedMu.Lock()
    require.Empty(t, s.loaded)
    s.loadedMu.Unlock()
    err := <-req.errCh
    require.Contains(t, err.Error(), "this model may be incompatible")

    server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
    s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
        return server, nil
    }
    s.load(req, ggml, gpus, 0)
    select {
    case err := <-req.errCh:
        require.NoError(t, err)
    case resp := <-req.successCh:
        require.Equal(t, uint64(10), resp.estimatedVRAM)
        require.Equal(t, uint(1), resp.refCount)
        s.loadedMu.Lock()
        require.Len(t, s.loaded, 1)
        s.loadedMu.Unlock()
    }

    req.model.ModelPath = "dummy_model_path"
    server.waitResp = fmt.Errorf("wait failure")
    s.load(req, ggml, gpus, 0)
    select {
    case err := <-req.errCh:
        require.Contains(t, err.Error(), "wait failure")
    case resp := <-req.successCh:
        t.Fatalf("unexpected success %v", resp)
    }
    s.loadedMu.Lock()
    runner := s.loaded["dummy_model_path"]
    s.loadedMu.Unlock()
    require.NotNil(t, runner)
    require.Equal(t, uint(0), runner.refCount)
    time.Sleep(1 * time.Millisecond)
    require.Len(t, s.expiredCh, 1)
}
type reqBundle struct {
    ctx     context.Context //nolint:containedctx
    ctxDone func()
    srv     *mockLlm
    req     *LlmRequest
    ggml    *llm.GGML
}

func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
    return scenario.srv, nil
}

// newScenarioRequest writes a minimal GGUF model to a temp file and returns a
// bundle holding the request, its cancelable context, and a mock server for it.
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
    b := &reqBundle{}
    b.ctx, b.ctxDone = context.WithCancel(ctx)
    t.Helper()

    f, err := os.CreateTemp(t.TempDir(), modelName)
    require.NoError(t, err)
    defer f.Close()

    gguf := llm.NewGGUFV3(binary.LittleEndian)
    err = gguf.Encode(f, llm.KV{
        "general.architecture":          "llama",
        "general.name":                  "name",
        "llama.context_length":          uint32(32),
        "llama.embedding_length":        uint32(4096),
        "llama.block_count":             uint32(1),
        "llama.attention.head_count":    uint32(32),
        "llama.attention.head_count_kv": uint32(32),
        "tokenizer.ggml.tokens":         []string{" "},
        "tokenizer.ggml.scores":         []float32{0},
        "tokenizer.ggml.token_type":     []int32{0},
    }, []llm.Tensor{
        {Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
        {Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
    })
    require.NoError(t, err)

    fname := f.Name()
    model := &Model{Name: modelName, ModelPath: fname}
    b.ggml, err = llm.LoadModel(model.ModelPath, 0)
    require.NoError(t, err)

    if duration == nil {
        duration = &api.Duration{Duration: 5 * time.Millisecond}
    }
    b.req = &LlmRequest{
        ctx:             b.ctx,
        model:           model,
        opts:            api.DefaultOptions(),
        sessionDuration: duration,
        successCh:       make(chan *runnerRef, 1),
        errCh:           make(chan error, 1),
    }
    b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
    return b
}
// getGpuFn and getCpuFn return fixed fake memory info so tests are deterministic.
func getGpuFn() gpu.GpuInfoList {
    g := gpu.GpuInfo{Library: "metal"}
    g.TotalMemory = 24 * format.GigaByte
    g.FreeMemory = 12 * format.GigaByte
    return []gpu.GpuInfo{g}
}

func getCpuFn() gpu.GpuInfoList {
    g := gpu.GpuInfo{Library: "cpu"}
    g.TotalMemory = 32 * format.GigaByte
    g.FreeMemory = 26 * format.GigaByte
    return []gpu.GpuInfo{g}
}
func TestRequestsSameModelSameRequest(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer done()
    s := InitScheduler(ctx)
    s.getGpuFn = getGpuFn
    s.getCpuFn = getCpuFn
    a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
    b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
    b.req.model = a.req.model
    b.ggml = a.ggml

    s.newServerFn = a.newServer
    slog.Info("a")
    s.pendingReqCh <- a.req
    require.Len(t, s.pendingReqCh, 1)
    s.Run(ctx)
    select {
    case resp := <-a.req.successCh:
        require.Equal(t, resp.llama, a.srv)
        require.Empty(t, s.pendingReqCh)
        require.Empty(t, a.req.errCh)
    case err := <-a.req.errCh:
        t.Fatal(err.Error())
    case <-ctx.Done():
        t.Fatal("timeout")
    }

    // Same runner as first request due to not needing a reload
    s.newServerFn = b.newServer
    slog.Info("b")
    s.pendingReqCh <- b.req
    select {
    case resp := <-b.req.successCh:
        require.Equal(t, resp.llama, a.srv)
        require.Empty(t, s.pendingReqCh)
        require.Empty(t, b.req.errCh)
    case err := <-b.req.errCh:
        t.Fatal(err.Error())
    case <-ctx.Done():
        t.Fatal("timeout")
    }
}
func TestRequestsSimpleReloadSameModel(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer done()
    s := InitScheduler(ctx)
    s.getGpuFn = getGpuFn
    s.getCpuFn = getCpuFn
    a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
    b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
    tmpModel := *a.req.model
    b.req.model = &tmpModel
    b.ggml = a.ggml

    s.newServerFn = a.newServer
    slog.Info("a")
    s.pendingReqCh <- a.req
    require.Len(t, s.pendingReqCh, 1)
    s.Run(ctx)
    select {
    case resp := <-a.req.successCh:
        require.Equal(t, resp.llama, a.srv)
        require.Empty(t, s.pendingReqCh)
        require.Empty(t, a.req.errCh)
    case err := <-a.req.errCh:
        t.Fatal(err.Error())
    case <-ctx.Done():
        t.Fatal("timeout")
    }

    // Trigger a reload
    s.newServerFn = b.newServer
    b.req.model.AdapterPaths = []string{"new"}
    slog.Info("b")
    s.pendingReqCh <- b.req
    // finish first two requests, so model can reload
    time.Sleep(1 * time.Millisecond)
    a.ctxDone()
    select {
    case resp := <-b.req.successCh:
        require.Equal(t, resp.llama, b.srv)
        require.Empty(t, s.pendingReqCh)
        require.Empty(t, b.req.errCh)
    case err := <-b.req.errCh:
        t.Fatal(err.Error())
    case <-ctx.Done():
        t.Fatal("timeout")
    }
}
func TestRequestsMultipleLoadedModels(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer done()
    s := InitScheduler(ctx)
    s.getGpuFn = getGpuFn
    s.getCpuFn = getCpuFn

    // Multiple loaded models
    a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
    b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
    c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
    c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
    d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs a prior model to be unloaded

    envconfig.MaxRunners = 1
    s.newServerFn = a.newServer
    slog.Info("a")
    s.pendingReqCh <- a.req
    s.Run(ctx)
    select {
    case resp := <-a.req.successCh:
        require.Equal(t, resp.llama, a.srv)
        require.Empty(t, s.pendingReqCh)
        require.Empty(t, a.req.errCh)
    case err := <-a.req.errCh:
        t.Fatal(err.Error())
    case <-ctx.Done():
        t.Fatal("timeout")
    }
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 1)
    s.loadedMu.Unlock()

    envconfig.MaxRunners = 0
    s.newServerFn = b.newServer
    slog.Info("b")
    s.pendingReqCh <- b.req
    select {
    case resp := <-b.req.successCh:
        require.Equal(t, resp.llama, b.srv)
        require.Empty(t, s.pendingReqCh)
        require.Empty(t, b.req.errCh)
    case err := <-b.req.errCh:
        t.Fatal(err.Error())
    case <-ctx.Done():
        t.Fatal("timeout")
    }
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 2)
    s.loadedMu.Unlock()

    // This is a CPU load with NumGPU = 0 so it should load
    s.newServerFn = c.newServer
    slog.Info("c")
    s.pendingReqCh <- c.req
    select {
    case resp := <-c.req.successCh:
        require.Equal(t, resp.llama, c.srv)
        require.Empty(t, s.pendingReqCh)
        require.Empty(t, c.req.errCh)
    case err := <-c.req.errCh:
        t.Fatal(err.Error())
    case <-ctx.Done():
        t.Fatal("timeout")
    }
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 3)
    s.loadedMu.Unlock()

    // Try to load a model that won't fit
    s.newServerFn = d.newServer
    slog.Info("d")
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 3)
    s.loadedMu.Unlock()
    a.ctxDone() // Won't help since this one isn't big enough to make room
    time.Sleep(2 * time.Millisecond)
    s.pendingReqCh <- d.req
    // finish prior request, so new model can load
    time.Sleep(6 * time.Millisecond)
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 2)
    s.loadedMu.Unlock()
    b.ctxDone()
    select {
    case resp := <-d.req.successCh:
        require.Equal(t, resp.llama, d.srv)
        require.Empty(t, s.pendingReqCh)
        require.Empty(t, d.req.errCh)
    case <-ctx.Done():
        t.Fatal("timeout")
    }
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 2)
    s.loadedMu.Unlock()
}
func TestRequestsModelTooBigForSystem(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer done()
    s := InitScheduler(ctx)
    s.getGpuFn = func() gpu.GpuInfoList {
        g := gpu.GpuInfo{Library: "metal"}
        g.TotalMemory = 4 * format.MebiByte
        g.FreeMemory = 3 * format.MebiByte
        return []gpu.GpuInfo{g}
    }
    s.getCpuFn = func() gpu.GpuInfoList {
        g := gpu.GpuInfo{Library: "cpu"}
        g.TotalMemory = 4 * format.MebiByte
        g.FreeMemory = 2 * format.MebiByte
        return []gpu.GpuInfo{g}
    }
    a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
    s.newServerFn = a.newServer
    slog.Info("a")
    s.pendingReqCh <- a.req
    require.Len(t, s.pendingReqCh, 1)
    s.Run(ctx)
    select {
    case <-a.req.successCh:
        if runtime.GOOS == "linux" {
            t.Fatal("request should have been rejected with out of space")
        }
        // else - Darwin and Windows don't reject right now
    case err := <-a.req.errCh:
        require.Contains(t, err.Error(), "too large")
    case <-ctx.Done():
        t.Fatal("timeout")
    }
}
func TestGetRunner(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()
    a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
    b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
    c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
    envconfig.MaxQueuedRequests = 1
    s := InitScheduler(ctx)
    s.getGpuFn = getGpuFn
    s.getCpuFn = getCpuFn
    s.newServerFn = a.newServer
    slog.Info("a")
    successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
    require.Len(t, s.pendingReqCh, 1)

    slog.Info("b")
    successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
    require.Len(t, s.pendingReqCh, 1)
    require.Empty(t, successCh1b)
    require.Len(t, errCh1b, 1)
    err := <-errCh1b
    require.Contains(t, err.Error(), "server busy")

    s.Run(ctx)
    select {
    case resp := <-successCh1a:
        require.Equal(t, resp.llama, a.srv)
        require.Empty(t, s.pendingReqCh)
        require.Empty(t, errCh1a)
    case err := <-errCh1a:
        t.Fatal(err.Error())
    case <-ctx.Done():
        t.Fatal("timeout")
    }
    a.ctxDone() // Set "a" model to idle so it can unload
    s.loadedMu.Lock()
    require.Len(t, s.loaded, 1)
    s.loadedMu.Unlock()

    c.req.model.ModelPath = "bad path"
    slog.Info("c")
    successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
    // Starts in pending channel, then should be quickly processed to return an error
    time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload
    require.Empty(t, successCh1c)
    s.loadedMu.Lock()
    require.Empty(t, s.loaded)
    s.loadedMu.Unlock()
    require.Len(t, errCh1c, 1)
    err = <-errCh1c
    require.Contains(t, err.Error(), "bad path")
    b.ctxDone()
}
// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer done()
    // Same model, same request
    scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
    s := InitScheduler(ctx)
    s.getGpuFn = func() gpu.GpuInfoList {
        g := gpu.GpuInfo{Library: "metal"}
        g.TotalMemory = 24 * format.GigaByte
        g.FreeMemory = 12 * format.GigaByte
        return []gpu.GpuInfo{g}
    }
    s.newServerFn = scenario1a.newServer
    successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
    require.Len(t, s.pendingReqCh, 1)
    s.Run(ctx)
    select {
    case resp := <-successCh1a:
        require.Equal(t, resp.llama, scenario1a.srv)
        require.Empty(t, s.pendingReqCh)
        require.Empty(t, errCh1a)
        s.loadedMu.Lock()
        require.Len(t, s.loaded, 1)
        s.loadedMu.Unlock()
        slog.Info("sending premature expired event now")
        s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
    case err := <-errCh1a:
        t.Fatal(err.Error())
    case <-ctx.Done():
        t.Fatal("timeout")
    }
    time.Sleep(scenario1a.req.sessionDuration.Duration)
    scenario1a.ctxDone()
    time.Sleep(20 * time.Millisecond)
    require.LessOrEqual(t, len(s.finishedReqCh), 1)
    time.Sleep(10 * time.Millisecond)
    require.Empty(t, s.finishedReqCh)
    s.loadedMu.Lock()
    require.Empty(t, s.loaded)
    s.loadedMu.Unlock()
    // also shouldn't happen in real life
    s.finishedReqCh <- scenario1a.req
    time.Sleep(5 * time.Millisecond)
}
func TestUseLoadedRunner(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    req := &LlmRequest{
        ctx:             ctx,
        opts:            api.DefaultOptions(),
        successCh:       make(chan *runnerRef, 1),
        sessionDuration: &api.Duration{Duration: 2},
    }
    finished := make(chan *LlmRequest)
    llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
    r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
    req.useLoadedRunner(r1, finished)
    require.Equal(t, uint(1), r1.refCount)
    require.Equal(t, time.Duration(2), r1.sessionDuration)
    select {
    case success := <-req.successCh:
        require.Equal(t, r1, success)
    case err := <-req.errCh:
        t.Fatal(err.Error())
    case <-ctx.Done():
        t.Fatal("timeout")
    }
    done()
    fin := <-finished
    require.Equal(t, req, fin)
}
func TestUpdateFreeSpace(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()
    gpus := gpu.GpuInfoList{
        {
            Library: "a",
            ID:      "1",
        },
        {
            Library: "a",
            ID:      "2",
        },
    }
    gpus[0].TotalMemory = 1000
    gpus[0].FreeMemory = 900
    gpus[1].TotalMemory = 2000
    gpus[1].FreeMemory = 1900
    llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
    llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
    r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
    r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}

    s := InitScheduler(ctx)
    s.loadedMu.Lock()
    s.loaded["a"] = r1
    s.loaded["b"] = r2
    s.loadedMu.Unlock()

    s.updateFreeSpace(gpus)
    require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
    require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
}
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()
    gpus := gpu.GpuInfoList{
        {
            Library: "cuda",
            ID:      "0",
        },
        {
            Library: "cuda",
            ID:      "1",
        },
    }
    r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}

    s := InitScheduler(ctx)
    s.loadedMu.Lock()
    s.loaded["a"] = r1
    s.loadedMu.Unlock()

    tmp := s.filterGPUsWithoutLoadingModels(gpus)
    require.Len(t, tmp, 1)
    require.Equal(t, "1", tmp[0].ID)

    r1.gpus = gpu.GpuInfoList{gpus[1]}
    tmp = s.filterGPUsWithoutLoadingModels(gpus)
    require.Len(t, tmp, 1)
    require.Equal(t, "0", tmp[0].ID)

    r1.gpus = gpu.GpuInfoList{}
    tmp = s.filterGPUsWithoutLoadingModels(gpus)
    require.Len(t, tmp, 2)
}
func TestFindRunnerToUnload(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()
    r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
    r2 := &runnerRef{sessionDuration: 2, numParallel: 1}

    s := InitScheduler(ctx)
    s.loadedMu.Lock()
    s.loaded["a"] = r1
    s.loaded["b"] = r2
    s.loadedMu.Unlock()

    resp := s.findRunnerToUnload()
    require.Equal(t, r2, resp)
    r2.refCount = 1
    resp = s.findRunnerToUnload()
    require.Equal(t, r1, resp)
}
func TestNeedsReload(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()
    llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
    do := api.DefaultOptions()
    runner := &runnerRef{
        model: &Model{
            AdapterPaths:   []string{"adapter1"},
            ProjectorPaths: []string{"projector1"},
        },
        Options:     &do,
        llama:       llm,
        numParallel: 1,
    }
    req := &LlmRequest{
        model: &Model{
            AdapterPaths:   []string{"adapter2"},
            ProjectorPaths: []string{"projector2"},
        },
        opts: api.DefaultOptions(),
    }
    resp := runner.needsReload(ctx, req)
    require.True(t, resp)
    req.model.AdapterPaths = runner.model.AdapterPaths
    resp = runner.needsReload(ctx, req)
    require.True(t, resp)
    req.model.ProjectorPaths = runner.model.ProjectorPaths
    runner.loading = true
    req.opts.NumBatch = 1234
    resp = runner.needsReload(ctx, req)
    require.True(t, resp)
    req.opts.NumBatch = runner.Options.NumBatch
    llm.pingResp = fmt.Errorf("foo")
    resp = runner.needsReload(ctx, req)
    require.True(t, resp)
    llm.pingResp = nil
    resp = runner.needsReload(ctx, req)
    require.False(t, resp)
    req.opts.NumGPU = 99
    resp = runner.needsReload(ctx, req)
    require.True(t, resp)
    req.opts.NumGPU = -1
    resp = runner.needsReload(ctx, req)
    require.False(t, resp)
}
func TestUnloadAllRunners(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
    defer done()
    llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
    llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
    s := InitScheduler(ctx)
    s.unloadAllRunners()

    r1 := &runnerRef{llama: llm1, numParallel: 1}
    r2 := &runnerRef{llama: llm2, numParallel: 1}
    s.loadedMu.Lock()
    s.loaded["a"] = r1
    s.loaded["b"] = r2
    s.loadedMu.Unlock()

    s.unloadAllRunners()
    require.True(t, llm1.closeCalled)
    require.True(t, llm2.closeCalled)
}
func TestUnload(t *testing.T) {
    llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
    r1 := &runnerRef{llama: llm1, numParallel: 1}
    r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
    r1.unload()
    require.True(t, llm1.closeCalled)
    r2.unload()
    require.Nil(t, r2.model)
}
func TestAlreadyCanceled(t *testing.T) {
    ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer done()
    dctx, done2 := context.WithCancel(ctx)
    done2()
    scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
    s := InitScheduler(ctx)
    slog.Info("scenario1a")
    s.pendingReqCh <- scenario1a.req
    require.Len(t, s.pendingReqCh, 1)
    s.Run(ctx)
    time.Sleep(5 * time.Millisecond)
    require.Empty(t, s.pendingReqCh)
    require.Empty(t, scenario1a.req.errCh)
    require.Empty(t, scenario1a.req.successCh)
}
// mockLlm is a test double for llm.LlamaServer that returns canned responses
// and records whether Close was called.
type mockLlm struct {
    pingResp           error
    waitResp           error
    completionResp     error
    embeddingResp      []float64
    embeddingRespErr   error
    tokenizeResp       []int
    tokenizeRespErr    error
    detokenizeResp     string
    detokenizeRespErr  error
    closeResp          error
    closeCalled        bool
    estimatedVRAM      uint64
    estimatedTotal     uint64
    estimatedVRAMByGPU map[string]uint64
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
    return s.completionResp
}
func (s *mockLlm) Embedding(ctx context.Context, prompt string) ([]float64, error) {
    return s.embeddingResp, s.embeddingRespErr
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
    return s.tokenizeResp, s.tokenizeRespErr
}
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
    return s.detokenizeResp, s.detokenizeRespErr
}
func (s *mockLlm) Close() error {
    s.closeCalled = true
    return s.closeResp
}
func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }