// memory.go

package llm

import (
    "fmt"
    "log/slog"
    "strconv"
    "strings"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/envconfig"
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/gpu"
)

// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
    // Split up the GPUs by type and try them
    var estimatedVRAM uint64
    for _, gpus := range allGpus.ByLibrary() {
        var layerCount int
        estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
        layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
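        // NumGPU < 0 means automatic placement: a full fit requires every block plus
        // the output layer. Otherwise the user's requested layer count is the bar.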
        if opts.NumGPU < 0 {
            if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
                return true, estimatedVRAM
            }
        } else {
            if layerCount > 0 && layerCount >= opts.NumGPU {
                return true, estimatedVRAM
            }
        }
    }
    return false, estimatedVRAM
}

type MemoryEstimate struct {
    // How many layers we predict we can load
    Layers int

    // The size of the graph which occupies the main GPU
    Graph uint64

    // How much VRAM will be allocated given the number of layers we predict
    VRAMSize uint64

    // The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
    TotalSize uint64

    // For multi-GPU scenarios, this provides the tensor split parameter
    TensorSplit string

    // For multi-GPU scenarios, this is the size in bytes per GPU
    GPUSizes []uint64

    // internal fields for logging purposes
    inferenceLibrary    string
    layersRequested     int
    layersModel         int
    availableList       []string
    kv                  uint64
    allocationsList     []string
    memoryWeights       uint64
    memoryLayerOutput   uint64
    graphFullOffload    uint64
    graphPartialOffload uint64
}

// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
    // Graph size for a partial offload, applies to all GPUs
    var graphPartialOffload uint64

    // Graph size when all layers are offloaded, applies to all GPUs
    var graphFullOffload uint64

    // Final graph offload once we know full or partial
    var graphOffload uint64

    // Projectors loaded into GPU0 only
    var projectorSize uint64

    // Conditional output size on GPU 0
    var memoryLayerOutput uint64

    // The size of a single layer
    var layerSize uint64

    // The sum of all the layer sizes (just for logging)
    var memoryWeights uint64

    // True if all the layers are loaded
    var fullyLoaded bool

    // Overflow that didn't fit into the GPU
    var overflow uint64

    overhead := envconfig.GPUOverhead()
    availableList := make([]string, len(gpus))
    for i, gpu := range gpus {
        availableList[i] = format.HumanBytes2(gpu.FreeMemory)
    }
    slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

    for _, projector := range projectors {
        projectorSize += projectorMemoryRequirements(projector)

        // multimodal models require at least 2048 context
        opts.NumCtx = max(opts.NumCtx, 2048)
    }

    layers := ggml.Tensors().Layers()
    // add one layer worth of memory as a buffer
    if blk0, ok := layers["blk.0"]; ok {
        layerSize = blk0.size()
    } else {
        slog.Warn("model missing blk.0 layer size")
    }

    // fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
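    // For illustration: with a 4096-token context, 32 blocks, 128-dim K and V heads,
    // and 8 KV heads, that is 2*4096*32*(128+128)*8 bytes = 512 MiB of fp16 cache.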
    var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()

    // KV is proportional to the number of layers
    layerSize += kv / ggml.KV().BlockCount()

    graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
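    // GraphSize may not know how to model every architecture; when it reports zero,
    // fall back to a rough estimate scaled by the GQA factor and the KV cache size.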
    if graphPartialOffload == 0 {
        graphPartialOffload = ggml.KV().GQA() * kv / 6
    }
    if graphFullOffload == 0 {
        graphFullOffload = graphPartialOffload
    }

    // on metal there's no partial offload overhead
    if gpus[0].Library == "metal" {
        graphPartialOffload = graphFullOffload
    } else if len(gpus) > 1 {
        // multigpu should always use the partial graph size
        graphFullOffload = graphPartialOffload
    }

    if layer, ok := layers["output_norm"]; ok {
        memoryLayerOutput += layer.size()
    }
    if layer, ok := layers["output"]; ok {
        memoryLayerOutput += layer.size()
    } else if layer, ok := layers["token_embd"]; ok {
        memoryLayerOutput += layer.size()
    }

    // Output layer handled at the end if we have space
    gpuZeroOverhead := projectorSize

    // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
    var layerCount int
    layerCounts := make([]int, len(gpus))
    gpuAllocations := make([]uint64, len(gpus))
    type gs struct {
        i int
        g *gpu.GpuInfo
    }
    gpusWithSpace := []gs{}
    for i := range gpus {
        var gzo uint64
        if len(gpusWithSpace) == 0 {
            gzo = gpuZeroOverhead
        }
        // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
        if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
            slog.Debug("gpu has too little memory to allocate any layers",
                "id", gpus[i].ID,
                "library", gpus[i].Library,
                "variant", gpus[i].Variant,
                "compute", gpus[i].Compute,
                "driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
                "name", gpus[i].Name,
                "total", format.HumanBytes2(gpus[i].TotalMemory),
                "available", format.HumanBytes2(gpus[i].FreeMemory),
                "minimum_memory", gpus[i].MinimumMemory,
                "layer_size", format.HumanBytes2(layerSize),
                "gpu_zero_overhead", format.HumanBytes2(gzo),
                "partial_offload", format.HumanBytes2(graphPartialOffload),
                "full_offload", format.HumanBytes2(graphFullOffload),
            )
            continue
        }
        gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
        gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
    }
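
    // The first GPU with space also hosts gpuZeroOverhead (the projector weights),
    // so multimodal projectors stay on a single device.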
    var gpuZeroID int
    if len(gpusWithSpace) > 0 {
        gpuZeroID = gpusWithSpace[0].i
        gpuAllocations[gpuZeroID] += gpuZeroOverhead
    }

    // For all the layers, find where they can fit on the GPU(s)
    for i := range int(ggml.KV().BlockCount()) {
        // Some models have inconsistent layer sizes
        if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
            layerSize = blk.size()
            layerSize += kv / ggml.KV().BlockCount()
        }
        memoryWeights += layerSize

        if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
            // Stop allocating on GPU(s) once we hit the user's target NumGPU
            continue
        }

        // distribute the layers across the GPU(s) that have space
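        // Layers are assigned round-robin by index (i % j) across the GPUs that still
        // have room; a GPU that cannot fit the next layer is dropped from the candidate
        // list so the remaining GPUs are retried for that layer.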
        for j := len(gpusWithSpace); j > 0; j-- {
            g := gpusWithSpace[i%j]
            used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
            if (g.g.FreeMemory - overhead) > used+layerSize {
                gpuAllocations[g.i] += layerSize
                layerCounts[g.i]++
                layerCount++
                break
            } else {
                gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
            }
        }
    }
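
    // Any blocks that could not be placed stay on the CPU; their size is tracked as
    // overflow so the total memory requirement still reflects the whole model.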
    if layerCount >= int(ggml.KV().BlockCount()) {
        fullyLoaded = true
    } else {
        for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
            overflow += layerSize
        }
    }

    // Determine if we need to consider output then find where it fits
    if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
        for j := len(gpusWithSpace); j > 0; j-- {
            g := gpusWithSpace[layerCount%j]
            used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
            if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
                gpuAllocations[g.i] += memoryLayerOutput
                layerCounts[g.i]++
                layerCount++
                break
            }
        }

        if layerCount < int(ggml.KV().BlockCount())+1 {
            fullyLoaded = false
            overflow += memoryLayerOutput
        }
    }

    // Add the applicable (full or partial) graph allocations
    for i := range gpus {
        if layerCounts[i] <= 0 {
            continue
        }
        if fullyLoaded {
            gpuAllocations[i] += graphFullOffload
        } else {
            gpuAllocations[i] += graphPartialOffload
        }
    }
    if fullyLoaded {
        graphOffload = graphFullOffload
    } else {
        graphOffload = graphPartialOffload
    }

    // Summaries for the log
    var memoryRequiredPartial, memoryRequiredTotal uint64
    for i := range gpuAllocations {
        memoryRequiredPartial += gpuAllocations[i]
    }
    memoryRequiredTotal = memoryRequiredPartial + overflow
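
    // tensorSplit records the per-GPU layer counts as a comma-separated list
    // (e.g. "17,15" for two GPUs); this becomes the TensorSplit parameter for multi-GPU runs.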
    tensorSplit := ""
    if len(gpus) > 1 {
        splits := make([]string, len(gpus))
        for i, count := range layerCounts {
            splits[i] = strconv.Itoa(count)
        }
        tensorSplit = strings.Join(splits, ",")
    }
    allocationsList := []string{}
    for _, a := range gpuAllocations {
        allocationsList = append(allocationsList, format.HumanBytes2(a))
    }

    estimate := MemoryEstimate{
        TotalSize: memoryRequiredTotal,
        Layers:    0,
        Graph:     0,
        VRAMSize:  0,
        GPUSizes:  []uint64{},

        inferenceLibrary:    gpus[0].Library,
        layersRequested:     opts.NumGPU,
        layersModel:         int(ggml.KV().BlockCount()) + 1,
        availableList:       availableList,
        kv:                  kv,
        allocationsList:     allocationsList,
        memoryWeights:       memoryWeights,
        memoryLayerOutput:   memoryLayerOutput,
        graphFullOffload:    graphFullOffload,
        graphPartialOffload: graphPartialOffload,
    }
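
    // The estimate starts with zero offload values; the fields below are only filled in
    // when at least one layer can be placed on a non-CPU backend.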
    if gpus[0].Library == "cpu" {
        return estimate
    }
    if layerCount == 0 {
        slog.Debug("insufficient VRAM to load any model layers")
        return estimate
    }
    estimate.Layers = layerCount
    estimate.Graph = graphOffload
    estimate.VRAMSize = memoryRequiredPartial
    estimate.TotalSize = memoryRequiredTotal
    estimate.TensorSplit = tensorSplit
    estimate.GPUSizes = gpuAllocations
    return estimate
}

func (m MemoryEstimate) log() {
    overhead := envconfig.GPUOverhead()
    slog.Info(
        "offload to "+m.inferenceLibrary,
        slog.Group(
            "layers",
            // requested number of layers to offload
            "requested", m.layersRequested,
            // The number of layers the model has (including output)
            "model", m.layersModel,
            // estimated number of layers that can be offloaded
            "offload", m.Layers,
            // multi-gpu split for tensors
            "split", m.TensorSplit,
        ),
        slog.Group(
            "memory",
            // memory available by GPU for offloading
            "available", m.availableList,
            "gpu_overhead", format.HumanBytes2(overhead),
            slog.Group(
                "required",
                // memory required for full offloading
                "full", format.HumanBytes2(m.TotalSize),
                // memory required to offload the estimated number of layers
                "partial", format.HumanBytes2(m.VRAMSize),
                // memory of KV cache
                "kv", format.HumanBytes2(m.kv),
                // Allocations across the GPUs
                "allocations", m.allocationsList,
            ),
            slog.Group(
                "weights",
                // memory of the weights
                "total", format.HumanBytes2(m.memoryWeights),
                // memory of repeating layers
                "repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
                // memory of non-repeating layers
                "nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
            ),
            slog.Group(
                "graph",
                // memory of graph when fully offloaded
                "full", format.HumanBytes2(m.graphFullOffload),
                // memory of graph when not fully offloaded
                "partial", format.HumanBytes2(m.graphPartialOffload),
            ),
        ),
    )
}