memory.go

package llm

import (
	"fmt"
	"log/slog"
	"strconv"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
)
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
	// Split up the GPUs by type and try them
	var estimatedVRAM uint64
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}
	}
	return false, estimatedVRAM
}
type MemoryEstimate struct {
	// How many layers we predict we can load
	Layers int

	// The size of the graph which occupies the main GPU
	Graph uint64

	// How much VRAM will be allocated given the number of layers we predict
	VRAMSize uint64

	// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
	TotalSize uint64

	// For multi-GPU scenarios, this provides the tensor split parameter
	TensorSplit string

	// For multi-GPU scenarios, this is the size in bytes per GPU
	GPUSizes []uint64
}
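
// Illustrative example of how the fields above relate (hypothetical numbers,
// not produced by this package): for a 32-layer model where only 24 layers
// fit across two GPUs, an estimate might look like
//
//	MemoryEstimate{
//		Layers:      24,                          // offloaded layers, including the output layer if it fit
//		VRAMSize:    9 << 30,                     // bytes that would actually be allocated on the GPUs
//		TotalSize:   12 << 30,                    // VRAMSize plus the overflow that stays on the CPU
//		TensorSplit: "12,12",                     // per-GPU layer counts, joined with commas
//		GPUSizes:    []uint64{5 << 30, 4 << 30},  // per-GPU allocations that sum to VRAMSize
//	}
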
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
	// Graph size for a partial offload, applies to all GPUs
	var graphPartialOffload uint64

	// Graph size when all layers are offloaded, applies to all GPUs
	var graphFullOffload uint64

	// Final graph offload once we know full or partial
	var graphOffload uint64

	// Projectors loaded into GPU0 only
	var projectorSize uint64

	// Conditional output size on GPU 0
	var memoryLayerOutput uint64
	var includeOutput bool

	// One extra layer as a pad for each GPU
	var layerBuffer uint64

	// The sizes of the main layers
	var layerSizes []uint64

	// The sum of all the layer sizes (just for logging)
	var memoryWeights uint64

	// True if all the layers are loaded
	var fullyLoaded bool

	// Overflow that didn't fit into the GPU
	var overflow uint64

	availableList := make([]string, len(gpus))
	for i, gpu := range gpus {
		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
	}
	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

	for _, projector := range projectors {
		projectorSize += projectorMemoryRequirements(projector)

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}

	layers := ggml.Tensors().Layers()

	// add one layer worth of memory as a buffer
	if blk0, ok := layers["blk.0"]; ok {
		layerBuffer = blk0.size()
	}

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
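
	// Worked example of the formula above (hypothetical Llama-style model, not
	// taken from a real GGUF): with n_ctx=2048, n_layer=32, n_embd=4096,
	// n_head=32 and n_head_kv=8 (grouped-query attention),
	//   kv = 2 * 2 * 2048 * 32 * 4096 / 32 * 8 = 268,435,456 bytes (256 MiB).
	// With full multi-head attention (n_head_kv=32) the same model would need
	// 1 GiB of fp16 KV cache at this context length.
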
	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
	if graphPartialOffload == 0 {
		graphPartialOffload = ggml.KV().GQA() * kv / 6
	}

	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

	// on metal there's no partial offload overhead
	if gpus[0].Library == "metal" {
		graphPartialOffload = graphFullOffload
	}

	if layer, ok := layers["output_norm"]; ok {
		memoryLayerOutput += layer.size()
	}
	if layer, ok := layers["output"]; ok {
		memoryLayerOutput += layer.size()
	} else if layer, ok := layers["token_embd"]; ok {
		memoryLayerOutput += layer.size()
	}

	if gpus[0].Library == "metal" && opts.UseMMap {
		includeOutput = true
	} else if gpus[0].Library != "metal" || !opts.UseMMap {
		includeOutput = true
	}

	gpuZeroOverhead := projectorSize
	if includeOutput {
		gpuZeroOverhead += memoryLayerOutput
	}

	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
	var layerCount int
	layerCounts := make([]int, len(gpus))
	gpuAllocations := make([]uint64, len(gpus))
	type gs struct {
		i int
		g *gpu.GpuInfo
	}
	gpusWithSpace := []gs{}
	for i := range gpus {
		var gzo uint64
		if len(gpusWithSpace) == 0 {
			gzo = gpuZeroOverhead
		}
		// Only include GPUs that can fit the graph, the GPU minimum memory, the layer buffer and at least one more layer
		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
			continue
		}
		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
		gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full
	}
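
	// For instance (hypothetical numbers): a GPU reporting 4 GiB free, with a
	// 900 MiB graph, 512 MiB MinimumMemory and a 400 MiB layer buffer needs
	// roughly 900 + 512 + 2*400 = 2212 MiB (plus the GPU-zero overhead if it is
	// the first GPU with space), so it is kept; a 2 GiB GPU would be skipped
	// here and never receive layers below.
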
	var gpuZeroID int
	if len(gpusWithSpace) > 0 {
		gpuZeroID = gpusWithSpace[0].i
		gpuAllocations[gpuZeroID] += gpuZeroOverhead
	}

	layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
	for i := range int(ggml.KV().BlockCount()) {
		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
			memoryLayer := blk.size()

			// KV is proportional to the number of layers
			memoryLayer += kv / ggml.KV().BlockCount()

			layerSizes[i] = memoryLayer
			memoryWeights += memoryLayer
		}
	}
	// For all the layers, find where they can fit on the GPU(s)
	for i := range layerSizes {
		if layerSizes[i] == 0 {
			continue
		}

		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
			// Stop allocating on GPU(s) once we hit the user's target NumGPU
			continue
		}

		// distribute the layers across the GPU(s) that have space
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[i%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if g.g.FreeMemory > used+layerSizes[i] {
				gpuAllocations[g.i] += layerSizes[i]
				layerCounts[g.i]++
				layerCount++
				break
			} else {
				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
			}
		}
	}
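
	// The i%j indexing above round-robins consecutive layers over the GPUs that
	// still have space: with two candidate GPUs, even-numbered layers are tried
	// on the first and odd-numbered layers on the second. A GPU that cannot
	// take the current layer is dropped from gpusWithSpace and the remaining
	// GPUs are retried; layers that cannot be placed anywhere are reflected in
	// the overflow total computed below.
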
	if layerCount >= int(ggml.KV().BlockCount()) {
		fullyLoaded = true
	} else {
		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
			overflow += layerSizes[i]
		}
	}

	// Find where the output fits
	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[layerCount%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if g.g.FreeMemory > used+memoryLayerOutput {
				gpuAllocations[g.i] += memoryLayerOutput
				layerCounts[g.i]++
				layerCount++
				break
			}
		}
		if layerCount < int(ggml.KV().BlockCount())+1 {
			fullyLoaded = false
			overflow += memoryLayerOutput
		}
	}

	// Add the applicable (full or partial) graph allocations
	for i := range gpus {
		if layerCounts[i] <= 0 {
			continue
		}
		if fullyLoaded {
			gpuAllocations[i] += graphFullOffload
		} else {
			gpuAllocations[i] += graphPartialOffload
		}
	}
	if fullyLoaded {
		graphOffload = graphFullOffload
	} else {
		graphOffload = graphPartialOffload
	}

	// Summaries for the log
	var memoryRequiredPartial, memoryRequiredTotal uint64
	for i := range gpuAllocations {
		memoryRequiredPartial += gpuAllocations[i]
	}
	memoryRequiredTotal = memoryRequiredPartial + overflow

	tensorSplit := ""
	if len(gpus) > 1 {
		splits := make([]string, len(gpus))
		for i, count := range layerCounts {
			splits[i] = strconv.Itoa(count)
		}
		tensorSplit = strings.Join(splits, ",")
	}
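
	// e.g. layerCounts of {20, 13} on two GPUs (hypothetical counts) yields
	// "20,13", which is what ends up in MemoryEstimate.TensorSplit.
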
	allocationsList := []string{}
	for _, a := range gpuAllocations {
		allocationsList = append(allocationsList, format.HumanBytes2(a))
	}

	slog.Info(
		"offload to gpu",
		slog.Group(
			"layers",
			// requested number of layers to offload
			"requested", opts.NumGPU,
			// The number of layers the model has (including output)
			"model", int(ggml.KV().BlockCount())+1,
			// estimated number of layers that can be offloaded
			"offload", layerCount,
			// multi-gpu split for tensors
			"split", tensorSplit,
		),
		slog.Group(
			"memory",
			// memory available by GPU for offloading
			"available", availableList,
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(memoryRequiredTotal),
				// memory required to offload the estimated number of layers (layers.offload)
				"partial", format.HumanBytes2(memoryRequiredPartial),
				// memory of KV cache
				"kv", format.HumanBytes2(kv),
				// Allocations across the GPUs
				"allocations", allocationsList,
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(memoryWeights),
				// memory of repeating layers
				"repeating", format.HumanBytes2(memoryWeights-memoryLayerOutput),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(graphPartialOffload),
			),
		),
	)
	if gpus[0].Library == "cpu" {
		return MemoryEstimate{
			Layers:    0,
			Graph:     0,
			VRAMSize:  0,
			TotalSize: memoryRequiredTotal,
			GPUSizes:  []uint64{},
		}
	}
	if layerCount == 0 {
		slog.Debug("insufficient VRAM to load any model layers")
		return MemoryEstimate{
			Layers:    0,
			Graph:     0,
			VRAMSize:  0,
			TotalSize: memoryRequiredTotal,
			GPUSizes:  []uint64{},
		}
	}

	return MemoryEstimate{
		Layers:      layerCount,
		Graph:       graphOffload,
		VRAMSize:    memoryRequiredPartial,
		TotalSize:   memoryRequiredTotal,
		TensorSplit: tensorSplit,
		GPUSizes:    gpuAllocations,
	}
}
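
// logFitDecision is an illustrative caller sketch, not part of the original
// file: it shows one way PredictServerFit's result could be used to decide
// whether already-loaded models would need to be unloaded first. The function
// name and the decision to merely log are assumptions for the example.
func logFitDecision(allGpus gpu.GpuInfoList, ggml *GGML, opts api.Options) {
	// no adapters or projectors in this sketch
	fits, estimatedVRAM := PredictServerFit(allGpus, ggml, nil, nil, opts)
	if fits {
		slog.Info("model fits in available VRAM", "estimated_vram", format.HumanBytes2(estimatedVRAM))
		return
	}
	slog.Info("model does not fully fit; other models may need to be unloaded",
		"estimated_vram", format.HumanBytes2(estimatedVRAM))
}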