memory.go

package llm

import (
	"fmt"
	"log/slog"
	"os"
	"strconv"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
)

// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
	var estimatedVRAM uint64
	if opts.NumCtx > int(ggml.KV().ContextLength()) {
		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
		opts.NumCtx = int(ggml.KV().ContextLength())
	}

	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}

	// Split up the GPUs by type and try them
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		layerCount, estimatedVRAM = EstimateGPULayers(gpus, ggml, projectors, opts)
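		// NumGPU < 0 means "auto": require a full fit (every block layer plus the
		// output layer). Otherwise honor the user's explicit layer count.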
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}
	}
	return false, estimatedVRAM
}

// Given a model and one or more GPU targets, predict how many layers and bytes we can load
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64) {
	if gpus[0].Library == "cpu" {
		return 0, 0
	}

	var memoryAvailable uint64
	for _, info := range gpus {
		memoryAvailable += info.FreeMemory
	}
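
	// OLLAMA_MAX_VRAM lets the user cap usable VRAM (a value in bytes), overriding
	// whatever the GPUs report as free.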
	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
	if userLimit != "" {
		avail, err := strconv.ParseUint(userLimit, 10, 64)
		if err != nil {
			slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
		} else {
			slog.Info("user override memory limit", "OLLAMA_MAX_VRAM", avail, "actual", memoryAvailable)
			memoryAvailable = avail
		}
	}

	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))

	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
	memoryMinimum := gpus[0].MinimumMemory

	for _, projector := range projectors {
		memoryMinimum += projectorMemoryRequirements(projector)

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
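
	// Illustrative example (not from the source): a hypothetical 32-layer model with
	// n_embd=4096, n_head=32 and n_head_kv=32 at NumCtx=2048 works out to
	// 2*2*2048*32*4096 bytes = 1 GiB of fp16 KV cache.
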
	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
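	// Fallback heuristic when the model metadata does not report a graph size:
	// scale the KV cache by the grouped-query-attention factor (a rough estimate,
	// not an exact measurement).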
	if graphPartialOffload == 0 {
		graphPartialOffload = ggml.KV().GQA() * kv / 6
	}

	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

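	// Assumption behind this estimate: each GPU allocates its own copy of the
	// compute graph, so scale the graph sizes by the GPU count.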
	graphFullOffload *= uint64(len(gpus))
	graphPartialOffload *= uint64(len(gpus))

	// on metal there's no partial offload overhead
	if gpus[0].Library == "metal" {
		graphPartialOffload = graphFullOffload
	}

	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
	memoryRequiredTotal := memoryMinimum + graphFullOffload

	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
	memoryRequiredPartial := memoryMinimum + graphPartialOffload

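	// At this point the partial figure is only fixed overhead (driver minimum plus
	// graph); if even that exceeds available memory, no layer can be offloaded.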
	if memoryRequiredPartial > memoryAvailable {
		slog.Debug("insufficient VRAM to load any model layers")
		return 0, 0
	}

	layers := ggml.Tensors().Layers()

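	// Size of the non-repeating output tensors: the output norm plus either a
	// dedicated output projection or, failing that, the token embedding.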
	var memoryLayerOutput uint64
	if layer, ok := layers["output_norm"]; ok {
		memoryLayerOutput += layer.size()
	}

	if layer, ok := layers["output"]; ok {
		memoryLayerOutput += layer.size()
	} else if layer, ok := layers["token_embd"]; ok {
		memoryLayerOutput += layer.size()
	}

	if gpus[0].Library == "metal" && opts.UseMMap {
		// memory is preallocated for output tensors
		memoryRequiredTotal += memoryLayerOutput
		memoryRequiredPartial += memoryLayerOutput
	}

	var layerCount int
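	// Greedily count how many repeating layers (each with its share of the KV
	// cache) fit into the remaining memory; the full-offload total keeps
	// accumulating every layer regardless.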
	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()

		// KV is proportional to the number of layers
		memoryLayer += kv / ggml.KV().BlockCount()

		memoryRequiredTotal += memoryLayer
		if memoryAvailable > memoryRequiredPartial+memoryLayer {
			memoryRequiredPartial += memoryLayer
			layerCount++
		}
	}

	if gpus[0].Library != "metal" || !opts.UseMMap {
		// memory was not preallocated for output tensors
		memoryRequiredTotal += memoryLayerOutput
	}

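	// If the whole model fits, report a full offload: every block layer plus the
	// output layer, and the partial figure collapses into the total.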
	if memoryAvailable > memoryRequiredTotal {
		layerCount = int(ggml.KV().BlockCount()) + 1
		memoryRequiredPartial = memoryRequiredTotal
	}

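	// Weights-only figure for logging: subtract the fixed minimum, the compute
	// graph, and the KV cache back out of the full-offload total.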
	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv

	slog.Info(
		"offload to gpu",
		slog.Group(
			"layers",
			// actual number of layers offloaded
			"real", opts.NumGPU,
			// estimated number of layers that can be offloaded
			"estimate", layerCount,
		),
		slog.Group(
			"memory",
			// memory available for offloading
			"available", format.HumanBytes2(memoryAvailable),
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(memoryRequiredTotal),
				// memory required to offload layers.estimate layers
				"partial", format.HumanBytes2(memoryRequiredPartial),
				// memory of KV cache
				"kv", format.HumanBytes2(kv),
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(memoryWeights),
				// memory of repeating layers
				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(graphPartialOffload),
			),
		),
	)

	return layerCount, uint64(memoryRequiredPartial)
}
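
// estimateFP16KVBytes is an illustrative sketch, not part of the original file: it
// repeats the fp16 KV-cache arithmetic from EstimateGPULayers as a standalone helper
// so the formula can be exercised with hypothetical model dimensions, e.g.
// estimateFP16KVBytes(2048, 32, 4096, 32, 32) returns 1 GiB.
func estimateFP16KVBytes(numCtx, nLayer, nEmbd, nHead, nHeadKV uint64) uint64 {
	// 2 tensors (K and V) * 2 bytes per fp16 element * context * layers * per-head dim * kv heads
	return 2 * 2 * numCtx * nLayer * nEmbd / nHead * nHeadKV
}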