sched_test.go

package server

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func init() {
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	initialMax := loadedMax
	s := InitScheduler(ctx)
	require.Equal(t, initialMax, loadedMax)
	require.NotNil(t, s.loaded)

	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "blue")
	s = InitScheduler(ctx)
	require.Equal(t, initialMax, loadedMax)
	require.NotNil(t, s.loaded)

	os.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
	s = InitScheduler(ctx)
	require.Equal(t, 0, loadedMax)
	require.NotNil(t, s.loaded)
}

func TestLoad(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	var ggml *llm.GGML // value not used in tests
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: 2,
	}
	// Fail to load model first
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
		return nil, fmt.Errorf("something failed to load model blah")
	}
	gpus := gpu.GpuInfoList{}
	s.load(req, ggml, gpus)
	require.Len(t, req.successCh, 0)
	require.Len(t, req.errCh, 1)
	require.Len(t, s.loaded, 0)
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

	server := &mockLlm{estimatedVRAM: 10}
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
		return server, nil
	}
	s.load(req, ggml, gpus)
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
		require.Len(t, s.loaded, 1)
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = fmt.Errorf("wait failure")
	s.load(req, ggml, gpus)
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
		t.Errorf("unexpected success %v", resp)
	}
	runner := s.loaded["dummy_model_path"]
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
	time.Sleep(1 * time.Millisecond)
	require.Len(t, s.expiredCh, 1)
}

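// bundle ties together the context, mock server, request, and GGML metadata
// that make up a single scheduler test scenario.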
type bundle struct {
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
	ggml    *llm.GGML
}

func (scenario *bundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options) (llm.LlamaServer, error) {
	return scenario.srv, nil
}

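// newScenario writes a minimal GGUF model to a temp file and returns a bundle
// whose request and mock server report the given estimated VRAM.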
func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64) *bundle {
	scenario := &bundle{}
	scenario.ctx, scenario.ctxDone = context.WithCancel(ctx)
	t.Helper()
	f, err := os.CreateTemp(t.TempDir(), modelName)
	assert.Nil(t, err)
	defer f.Close()
	gguf := llm.NewGGUFV3(binary.LittleEndian)
	err = gguf.Encode(f, llm.KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []llm.Tensor{
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
	})
	assert.Nil(t, err)
	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
	scenario.ggml, err = llm.LoadModel(model.ModelPath)
	require.NoError(t, err)
	scenario.req = &LlmRequest{
		ctx:             scenario.ctx,
		model:           model,
		sessionDuration: 5 * time.Millisecond,
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
	scenario.srv = &mockLlm{estimatedVRAM: estimatedVRAM}
	return scenario
}

func TestRequests(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	// Same model, same request
	scenario1a := newScenario(t, ctx, "ollama-model-1", 10)
	scenario1a.req.sessionDuration = 0
	scenario1b := newScenario(t, ctx, "ollama-model-1", 11)
	scenario1b.req.model = scenario1a.req.model
	scenario1b.ggml = scenario1a.ggml
	scenario1b.req.sessionDuration = 0

	// simple reload of the same model
	scenario2a := newScenario(t, ctx, "ollama-model-1", 20)
	scenario2a.req.model = scenario1a.req.model
	scenario2a.ggml = scenario1a.ggml

	// Multiple loaded models
	scenario3a := newScenario(t, ctx, "ollama-model-3a", 1*format.GigaByte)
	scenario3b := newScenario(t, ctx, "ollama-model-3b", 24*format.GigaByte)
	scenario3c := newScenario(t, ctx, "ollama-model-3c", 30) // Needs prior models unloaded to make room

	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-scenario1a.req.successCh:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario1a.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	// Same runner as the first request since no reload is needed
	s.newServerFn = scenario1b.newServer
	slog.Info("scenario1b")
	s.pendingReqCh <- scenario1b.req
	select {
	case resp := <-scenario1b.req.successCh:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario1b.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	// Trigger a reload
	s.newServerFn = scenario2a.newServer
	scenario2a.req.model.AdapterPaths = []string{"new"}
	slog.Info("scenario2a")
	s.pendingReqCh <- scenario2a.req
	// finish the first two requests so the model can reload
	time.Sleep(1 * time.Millisecond)
	scenario1a.ctxDone()
	scenario1b.ctxDone()
	select {
	case resp := <-scenario2a.req.successCh:
		require.Equal(t, resp.llama, scenario2a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario2a.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}

	loadedMax = 1
	s.newServerFn = scenario3a.newServer
	slog.Info("scenario3a")
	s.pendingReqCh <- scenario3a.req
	// finish the prior request so the new model can load
	time.Sleep(1 * time.Millisecond)
	scenario2a.ctxDone()
	select {
	case resp := <-scenario3a.req.successCh:
		require.Equal(t, resp.llama, scenario3a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3a.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	require.Len(t, s.loaded, 1)

	loadedMax = 0
	s.newServerFn = scenario3b.newServer
	slog.Info("scenario3b")
	s.pendingReqCh <- scenario3b.req
	select {
	case resp := <-scenario3b.req.successCh:
		require.Equal(t, resp.llama, scenario3b.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3b.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	require.Len(t, s.loaded, 2)

	// Try to load a model that won't fit
	s.newServerFn = scenario3c.newServer
	slog.Info("scenario3c")
	require.Len(t, s.loaded, 2)
	scenario3a.ctxDone() // Won't help since this one isn't big enough to make room
	time.Sleep(2 * time.Millisecond)
	s.pendingReqCh <- scenario3c.req
	// finish the prior request so the new model can load
	time.Sleep(6 * time.Millisecond)
	require.Len(t, s.loaded, 1)
	scenario3b.ctxDone()
	select {
	case resp := <-scenario3c.req.successCh:
		require.Equal(t, resp.llama, scenario3c.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, scenario3c.req.errCh, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	require.Len(t, s.loaded, 1)
}

func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()

	// Three separate models, with the queue limited to a single pending request
	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
	scenario1a.req.sessionDuration = 0
	scenario1b := newScenario(t, ctx, "ollama-model-1b", 10)
	scenario1b.req.sessionDuration = 0
	scenario1c := newScenario(t, ctx, "ollama-model-1c", 10)
	scenario1c.req.sessionDuration = 0
	maxQueuedRequests = 1
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	slog.Info("scenario1a")
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("scenario1b")
	successCh1b, errCh1b := s.GetRunner(scenario1b.ctx, scenario1b.req.model, scenario1b.req.opts, scenario1b.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	require.Len(t, successCh1b, 0)
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, errCh1a, 0)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	scenario1a.ctxDone()
	require.Len(t, s.loaded, 1)

	scenario1c.req.model.ModelPath = "bad path"
	slog.Info("scenario1c")
	successCh1c, errCh1c := s.GetRunner(scenario1c.ctx, scenario1c.req.model, scenario1c.req.opts, scenario1c.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 0)
	require.Len(t, successCh1c, 0)
	require.Len(t, errCh1c, 0)
	time.Sleep(5 * time.Millisecond)
	require.Len(t, s.loaded, 0)
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
	scenario1b.ctxDone()
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	// Single model request whose runner then receives a premature expired event
	scenario1a := newScenario(t, ctx, "ollama-model-1a", 10)
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
		require.Len(t, s.pendingReqCh, 0)
		require.Len(t, errCh1a, 0)
		require.Len(t, s.loaded, 1)
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure it's safe
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	time.Sleep(scenario1a.req.sessionDuration)
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
	require.Len(t, s.finishedReqCh, 0)
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 0)
	s.loadedMu.Unlock()
	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
	req := &LlmRequest{
		ctx:             ctx,
		successCh:       make(chan *runnerRef, 1),
		sessionDuration: 2,
	}
	finished := make(chan *LlmRequest)
	llm1 := &mockLlm{}
	r1 := &runnerRef{llama: llm1, sessionDuration: 1}
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
	case <-ctx.Done():
		t.Errorf("timeout")
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
	llm1 := &mockLlm{estimatedVRAM: 100}
	llm2 := &mockLlm{estimatedVRAM: 200}
	r1 := &runnerRef{llama: llm1, gpus: gpus}
	r2 := &runnerRef{llama: llm2, gpus: gpus}
	s := InitScheduler(ctx)
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.updateFreeSpace(gpus)
	require.Equal(t, uint64(850), gpus[0].FreeMemory)
	require.Equal(t, uint64(1850), gpus[1].FreeMemory)
}

func TestFindRunnerToUnload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
	defer done()
	req := &LlmRequest{ctx: ctx}
	r1 := &runnerRef{refCount: 1, sessionDuration: 1}
	r2 := &runnerRef{sessionDuration: 2}
	s := InitScheduler(ctx)
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	resp := s.findRunnerToUnload(req)
	require.Equal(t, r2, resp)
	r2.refCount = 1
	resp = s.findRunnerToUnload(req)
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
	defer done()
	llm := &mockLlm{}
	runner := &runnerRef{
		adapters:   []string{"adapter1"},
		projectors: []string{"projector1"},
		Options:    &api.Options{},
		llama:      llm,
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
		opts: api.Options{},
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.AdapterPaths = runner.adapters
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.model.ProjectorPaths = runner.projectors
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = fmt.Errorf("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
	defer done()
	llm1 := &mockLlm{}
	llm2 := &mockLlm{}
	s := InitScheduler(ctx)
	s.unloadAllRunners()

	r1 := &runnerRef{llama: llm1}
	r2 := &runnerRef{llama: llm2}
	s.loaded["a"] = r1
	s.loaded["b"] = r2
	s.unloadAllRunners()
	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
	llm1 := &mockLlm{}
	r1 := &runnerRef{llama: llm1}
	r2 := &runnerRef{adapters: []string{"A"}}
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
	require.Nil(t, r2.adapters)
}

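// mockLlm is a stub llm.LlamaServer whose responses are canned via its fields.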
type mockLlm struct {
	pingResp          error
	waitResp          error
	completionResp    error
	embeddingResp     [][]float64
	embeddingRespErr  error
	tokenizeResp      []int
	tokenizeRespErr   error
	detokenizeResp    string
	detokenizeRespErr error
	closeResp         error
	closeCalled       bool
	estimatedVRAM     uint64
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }

func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}

func (s *mockLlm) Embeddings(ctx context.Context, prompts []string) ([][]float64, error) {
	return s.embeddingResp, s.embeddingRespErr
}

func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}

func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detokenizeRespErr
}

func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}

func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM }