memory.go

package llm

import (
	"fmt"
	"log/slog"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
)
// PredictServerFit looks for a complete fit to determine if we need to unload other models.
// It returns whether a full fit was found and the estimated VRAM required.
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
	var estimatedVRAM uint64
	if opts.NumCtx > int(ggml.KV().ContextLength()) {
		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
		opts.NumCtx = int(ggml.KV().ContextLength())
	}

	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	// Split up the GPUs by type and try them
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
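		// NumGPU < 0 means offload as many layers as possible, so a fit requires every
		// layer (block count plus the output layer); otherwise honor the requested count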
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}
	}
	return false, estimatedVRAM
}
// EstimateGPULayers predicts, given a model and one or more GPU targets, how many layers
// can be loaded and how many bytes they require.
// The GPUs provided must all be the same Library.
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
	if gpus[0].Library == "cpu" {
		return 0, 0
	}
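
	// pool the free VRAM across every GPU in this split; layers are fit against the combined total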
	var memoryAvailable uint64
	for _, info := range gpus {
		memoryAvailable += info.FreeMemory
	}

	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))

	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
	memoryMinimum := gpus[0].MinimumMemory

	for _, projector := range projectors {
		memoryMinimum += projectorMemoryRequirements(projector)

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}
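
	// with grouped-query attention the per-token KV width is n_embd / n_head * n_head_kv,
	// so models with fewer KV heads than attention heads get a proportionally smaller cache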
	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()

	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
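	// not every architecture reports its compute graph size; fall back to a heuristic
	// derived from GQA and the KV cache size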
	if graphPartialOffload == 0 {
		graphPartialOffload = ggml.KV().GQA() * kv / 6
	}
	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

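	// scale the graph estimates by the number of GPUs in the split (each GPU gets its own graph allocation)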
	graphFullOffload *= uint64(len(gpus))
	graphPartialOffload *= uint64(len(gpus))

	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
	memoryRequiredTotal := memoryMinimum + graphFullOffload

	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
	memoryRequiredPartial := memoryMinimum + graphPartialOffload

	if memoryRequiredPartial > memoryAvailable {
		slog.Debug("insufficient VRAM to load any model layers")
		return 0, 0
	}

	var layerCount int
	layers := ggml.Tensors().Layers()
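
	// walk the repeating blocks: each adds its weights plus its share of the KV cache to the
	// full-offload total, and is counted toward partial offload only while it still fits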
	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()

		// KV is proportional to the number of layers
		memoryLayer += kv / ggml.KV().BlockCount()

		memoryRequiredTotal += memoryLayer
		if memoryAvailable > memoryRequiredPartial+memoryLayer {
			memoryRequiredPartial += memoryLayer
			layerCount++
		}
	}

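	// tensors outside the repeating blocks (e.g. token embedding and output tensors) make up the output layer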
	var memoryLayerOutput uint64
	for k, v := range layers {
		if !strings.HasPrefix(k, "blk.") {
			memoryLayerOutput += v.size()
		}
	}

	memoryRequiredTotal += memoryLayerOutput
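
	// if everything fits, including the output layer, report one extra layer and the full-offload figure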
	if memoryAvailable > memoryRequiredTotal {
		layerCount = int(ggml.KV().BlockCount()) + 1
		memoryRequiredPartial = memoryRequiredTotal
	}

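	// whatever remains after subtracting the fixed minimum, the graph, and the KV cache is weight
	// memory, reported below split into repeating and non-repeating (output) parts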
	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv

	slog.Info(
		"offload to gpu",
		slog.Group(
			"layers",
			// actual number of layers offloaded
			"real", opts.NumGPU,
			// estimated number of layers that can be offloaded
			"estimate", layerCount,
		),
		slog.Group(
			"memory",
			// memory available for offloading
			"available", format.HumanBytes2(memoryAvailable),
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(memoryRequiredTotal),
				// memory required to offload layers.estimate layers
				"partial", format.HumanBytes2(memoryRequiredPartial),
				// memory of KV cache
				"kv", format.HumanBytes2(kv),
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(memoryWeights),
				// memory of repeating layers
				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(graphPartialOffload),
			),
		),
	)

	return layerCount, uint64(memoryRequiredPartial)
}
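
// Illustrative usage (not part of memory.go): a caller such as the scheduler might consult
// these estimates roughly as follows, assuming it already has the detected GPUs, a parsed
// *GGML, the projector paths, and the requested api.Options:
//
//	if ok, vram := PredictServerFit(allGpus, ggml, nil, projectors, opts); ok {
//		slog.Info("model fits entirely on GPU", "estimated_vram", format.HumanBytes2(vram))
//	} else {
//		// fall back to a partial offload estimate for the first GPU library
//		layers, vram := EstimateGPULayers(allGpus.ByLibrary()[0], ggml, projectors, opts)
//		slog.Info("partial offload", "layers", layers, "estimated_vram", format.HumanBytes2(vram))
//	}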