memory.go

package llm

import (
	"log/slog"
	"strconv"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
)

// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
	// Split up the GPUs by type and try them
	var estimatedVRAM uint64
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
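		// NumGPU < 0 means "auto": require room for every block plus the output
		// layer before declaring a fit; otherwise honor the explicit layer count.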
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}
	}
	return false, estimatedVRAM
}

type MemoryEstimate struct {
	// How many layers we predict we can load
	Layers int

	// The size of the graph which occupies the main GPU
	Graph uint64

	// How much VRAM will be allocated given the number of layers we predict
	VRAMSize uint64

	// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
	TotalSize uint64

	// For multi-GPU scenarios, this provides the tensor split parameter
	TensorSplit string

	// For multi-GPU scenarios, this is the size in bytes per GPU
	GPUSizes []uint64
}

// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
	// Graph size for a partial offload, applies to all GPUs
	var graphPartialOffload uint64

	// Graph size when all layers are offloaded, applies to all GPUs
	var graphFullOffload uint64

	// Final graph offload once we know full or partial
	var graphOffload uint64

	// Projectors loaded into GPU0 only
	var projectorSize uint64

	// Conditional output size on GPU 0
	var memoryLayerOutput uint64

	// The size of a single layer
	var layerSize uint64

	// The sum of all the layer sizes (just for logging)
	var memoryWeights uint64

	// True if all the layers are loaded
	var fullyLoaded bool

	// Overflow that didn't fit into the GPU
	var overflow uint64

	availableList := make([]string, len(gpus))
	for i, gpu := range gpus {
		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
	}
	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

	for _, projector := range projectors {
		projectorSize += projectorMemoryRequirements(projector)

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}

	layers := ggml.Tensors().Layers()
	// add one layer worth of memory as a buffer
	if blk0, ok := layers["blk.0"]; ok {
		layerSize = blk0.size()
	} else {
		slog.Warn("model missing blk.0 layer size")
	}

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
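	// Illustrative example (hypothetical parameters): a 32-layer model with
	// n_embd=4096, 32 heads and 8 KV heads at 4096 context needs
	// 2*2*4096*32*(4096/32)*8 bytes ≈ 512 MiB of KV cache.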

	// KV is proportional to the number of layers
	layerSize += kv / ggml.KV().BlockCount()

	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
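	// When the architecture-specific graph estimate is unavailable (reported as 0),
	// fall back to a rough heuristic scaled by grouped-query attention.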
	if graphPartialOffload == 0 {
		graphPartialOffload = ggml.KV().GQA() * kv / 6
	}
	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

	// on metal there's no partial offload overhead
	if gpus[0].Library == "metal" {
		graphPartialOffload = graphFullOffload
	} else if len(gpus) > 1 {
		// multigpu should always use the partial graph size
		graphFullOffload = graphPartialOffload
	}

	if layer, ok := layers["output_norm"]; ok {
		memoryLayerOutput += layer.size()
	}
	if layer, ok := layers["output"]; ok {
		memoryLayerOutput += layer.size()
	} else if layer, ok := layers["token_embd"]; ok {
		memoryLayerOutput += layer.size()
	}
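	// Models with tied word embeddings typically ship no separate "output" tensor,
	// so token_embd stands in for the output weights above.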

	// Output layer handled at the end if we have space
	gpuZeroOverhead := projectorSize

	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
	var layerCount int
	layerCounts := make([]int, len(gpus))
	gpuAllocations := make([]uint64, len(gpus))
	type gs struct {
		i int
		g *gpu.GpuInfo
	}
	gpusWithSpace := []gs{}

	for i := range gpus {
		var gzo uint64
		if len(gpusWithSpace) == 0 {
			gzo = gpuZeroOverhead
		}
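		// Until a GPU qualifies, each candidate must also fit the GPU-zero overhead
		// (projectors); that overhead is later charged to the first GPU in the list.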
		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer, and at least one more layer
		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
			continue
		}
		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
	}

	var gpuZeroID int
	if len(gpusWithSpace) > 0 {
		gpuZeroID = gpusWithSpace[0].i
		gpuAllocations[gpuZeroID] += gpuZeroOverhead
	}

	// For all the layers, find where they can fit on the GPU(s)
	for i := range int(ggml.KV().BlockCount()) {
		memoryWeights += layerSize

		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
			// Stop allocating on GPU(s) once we hit the user's target NumGPU
			continue
		}

		// distribute the layers across the GPU(s) that have space
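		// Layers are assigned round-robin by layer index; a GPU that can no longer
		// hold another layer plus the graph is dropped from the candidate list and
		// the same layer is retried on the remaining GPUs.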
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[i%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if g.g.FreeMemory > used+layerSize {
				gpuAllocations[g.i] += layerSize
				layerCounts[g.i]++
				layerCount++
				break
			} else {
				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
			}
		}
	}

	if layerCount >= int(ggml.KV().BlockCount()) {
		fullyLoaded = true
	} else {
		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
			overflow += layerSize
		}
	}

	// Determine if we need to consider output then find where it fits
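	// The output layer (output_norm plus output/token_embd) goes on whichever GPU
	// still has room; if none does, it is counted toward the overflow that cannot
	// be offloaded.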
	if ((gpus[0].Library == "metal" && opts.UseMMap) || (gpus[0].Library != "metal" || !opts.UseMMap)) &&
		memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[layerCount%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if g.g.FreeMemory > used+memoryLayerOutput {
				gpuAllocations[g.i] += memoryLayerOutput
				layerCounts[g.i]++
				layerCount++
				break
			}
		}

		if layerCount < int(ggml.KV().BlockCount())+1 {
			fullyLoaded = false
			overflow += memoryLayerOutput
		}
	}

	// Add the applicable (full or partial) graph allocations
	for i := range gpus {
		if layerCounts[i] <= 0 {
			continue
		}
		if fullyLoaded {
			gpuAllocations[i] += graphFullOffload
		} else {
			gpuAllocations[i] += graphPartialOffload
		}
	}
	if fullyLoaded {
		graphOffload = graphFullOffload
	} else {
		graphOffload = graphPartialOffload
	}

	// Summaries for the log
	var memoryRequiredPartial, memoryRequiredTotal uint64
	for i := range gpuAllocations {
		memoryRequiredPartial += gpuAllocations[i]
	}
	memoryRequiredTotal = memoryRequiredPartial + overflow

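	// Per-GPU layer counts joined as a comma-separated string (e.g. "17,15"),
	// suitable for a proportional tensor-split setting on multi-GPU runs.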
	tensorSplit := ""
	if len(gpus) > 1 {
		splits := make([]string, len(gpus))
		for i, count := range layerCounts {
			splits[i] = strconv.Itoa(count)
		}
		tensorSplit = strings.Join(splits, ",")
	}
	allocationsList := []string{}
	for _, a := range gpuAllocations {
		allocationsList = append(allocationsList, format.HumanBytes2(a))
	}

	slog.Info(
		"offload to gpu",
		slog.Group(
			"layers",
			// requested number of layers to offload
			"requested", opts.NumGPU,
			// The number of layers the model has (including output)
			"model", int(ggml.KV().BlockCount())+1,
			// estimated number of layers that can be offloaded
			"offload", layerCount,
			// multi-gpu split for tensors
			"split", tensorSplit,
		),
		slog.Group(
			"memory",
			// memory available by GPU for offloading
			"available", availableList,
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(memoryRequiredTotal),
				// memory required to offload the estimated number of layers
				"partial", format.HumanBytes2(memoryRequiredPartial),
				// memory of KV cache
				"kv", format.HumanBytes2(kv),
				// Allocations across the GPUs
				"allocations", allocationsList,
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(memoryWeights),
				// memory of repeating layers
				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(graphPartialOffload),
			),
		),
	)
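	// Pure-CPU case: nothing is offloaded, so only the total host memory requirement is reported.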
	if gpus[0].Library == "cpu" {
		return MemoryEstimate{
			Layers:    0,
			Graph:     0,
			VRAMSize:  0,
			TotalSize: memoryRequiredTotal,
			GPUSizes:  []uint64{},
		}
	}
	if layerCount == 0 {
		slog.Debug("insufficient VRAM to load any model layers")
		return MemoryEstimate{
			Layers:    0,
			Graph:     0,
			VRAMSize:  0,
			TotalSize: memoryRequiredTotal,
			GPUSizes:  []uint64{},
		}
	}

	return MemoryEstimate{
		Layers:      layerCount,
		Graph:       graphOffload,
		VRAMSize:    memoryRequiredPartial,
		TotalSize:   memoryRequiredTotal,
		TensorSplit: tensorSplit,
		GPUSizes:    gpuAllocations,
	}
}