memory.go

package llm

import (
	"fmt"
	"log/slog"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
)

// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
	// Split up the GPUs by type and try them
	var estimatedVRAM uint64
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}
	}
	return false, estimatedVRAM
}
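
// Example (hedged sketch, not part of this file): a caller checking whether a
// model fits entirely on the available GPUs before deciding to evict another
// model. gpu.GetGPUInfo, ggml, and opts are assumed to come from the usual
// discovery and model-load paths.
//
//	gpus := gpu.GetGPUInfo()
//	if fits, vram := PredictServerFit(gpus, ggml, nil, nil, opts); fits {
//		slog.Info("model fits on gpu", "estimated", format.HumanBytes2(vram))
//	}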

// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
	var memoryAvailable uint64
	for _, info := range gpus {
		memoryAvailable += info.FreeMemory
	}
	if envconfig.MaxVRAM > 0 {
		memoryAvailable = envconfig.MaxVRAM
	}

	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))

	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
	memoryMinimum := gpus[0].MinimumMemory

	for _, projector := range projectors {
		memoryMinimum += projectorMemoryRequirements(projector)

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}

	layers := ggml.Tensors().Layers()

	// add one layer worth of memory as a buffer
	if blk0, ok := layers["blk.0"]; ok {
		memoryMinimum += blk0.size()
	}

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
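
	// Worked example (illustrative numbers only, assuming Llama-2-7B-like
	// hyperparameters): n_ctx=2048, n_layer=32, n_embd=4096, n_head=32,
	// n_head_kv=32 gives
	//	2 * 2 * 2048 * 32 * 4096 / 32 * 32 = 1073741824 bytes ≈ 1 GiB
	// of fp16 KV cache. Grouped-query models (n_head_kv < n_head) shrink this
	// proportionally.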

	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
	if graphPartialOffload == 0 {
		graphPartialOffload = ggml.KV().GQA() * kv / 6
	}
	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

	graphFullOffload *= uint64(len(gpus))
	graphPartialOffload *= uint64(len(gpus))
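
	// NOTE (added explanation, an assumption about intent): the graph scratch
	// buffer appears to be allocated per GPU rather than split across GPUs,
	// which is why both totals scale with the device count.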

	// on metal there's no partial offload overhead
	if gpus[0].Library == "metal" {
		graphPartialOffload = graphFullOffload
	}

	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
	memoryRequiredTotal := memoryMinimum + graphFullOffload

	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
	memoryRequiredPartial := memoryMinimum + graphPartialOffload

	var memoryLayerOutput uint64
	if layer, ok := layers["output_norm"]; ok {
		memoryLayerOutput += layer.size()
	}
	if layer, ok := layers["output"]; ok {
		memoryLayerOutput += layer.size()
	} else if layer, ok := layers["token_embd"]; ok {
		memoryLayerOutput += layer.size()
	}
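
	// (added note) models with tied embeddings ship no separate "output"
	// tensor; token_embd doubles as the output projection, so its size is
	// counted in that case instead.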

	if gpus[0].Library == "metal" && opts.UseMMap {
		// memory is preallocated for output tensors
		memoryRequiredTotal += memoryLayerOutput
		memoryRequiredPartial += memoryLayerOutput
	}

	var layerCount int
	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
			memoryLayer := blk.size()

			// KV is proportional to the number of layers
			memoryLayer += kv / ggml.KV().BlockCount()

			memoryRequiredTotal += memoryLayer
			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
				memoryRequiredPartial += memoryLayer
				layerCount++
			}
		}
	}
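
	// (added note) the loop above is a greedy fit: each repeating layer's
	// weights plus its share of the KV cache are added to the partial total
	// only while the result still fits in memoryAvailable (or until the
	// user-requested NumGPU layer count is reached).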

	if gpus[0].Library != "metal" || !opts.UseMMap {
		// memory was not preallocated for output tensors
		memoryRequiredTotal += memoryLayerOutput
	}

	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
		layerCount = int(ggml.KV().BlockCount()) + 1
		memoryRequiredPartial = memoryRequiredTotal
	}
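
	// (added note, an assumption about intent) BlockCount()+1 counts the
	// output layer as one extra offloadable layer, matching the convention
	// used by llama.cpp's n_gpu_layers.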

	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv

	slog.Info(
		"offload to gpu",
		slog.Group(
			"layers",
			// requested number of layers to offload
			"requested", opts.NumGPU,
			// estimated number of layers that can be offloaded
			"real", layerCount,
		),
		slog.Group(
			"memory",
			// memory available for offloading
			"available", format.HumanBytes2(memoryAvailable),
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(memoryRequiredTotal),
				// memory required to offload the estimated number of layers
				"partial", format.HumanBytes2(memoryRequiredPartial),
				// memory of KV cache
				"kv", format.HumanBytes2(kv),
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(memoryWeights),
				// memory of repeating layers
				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(graphPartialOffload),
			),
		),
	)

	if gpus[0].Library == "cpu" {
		return 0, 0, memoryRequiredTotal
	}
	if memoryRequiredPartial > memoryAvailable {
		slog.Debug("insufficient VRAM to load any model layers")
		return 0, 0, memoryRequiredTotal
	}

	return layerCount, memoryRequiredPartial, memoryRequiredTotal
}
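
// Example (hedged sketch, not part of this file): interpreting the three
// return values for a single-library GPU list.
//
//	layerCount, partial, total := EstimateGPULayers(gpus, ggml, nil, opts)
//	switch {
//	case layerCount == 0:
//		// nothing offloads; total still reflects the overall requirement
//	case layerCount >= int(ggml.KV().BlockCount()+1):
//		// full offload, including the output layer; partial == total
//	default:
//		// partial offload of layerCount repeating layers needing partial bytes
//	}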