memory.go 10 KB

package llm

import (
    "fmt"
    "log/slog"
    "strconv"
    "strings"

    "github.com/ollama/ollama/api"
    "github.com/ollama/ollama/format"
    "github.com/ollama/ollama/gpu"
)
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
    // Split up the GPUs by type and try them
    var estimatedVRAM uint64
    for _, gpus := range allGpus.ByLibrary() {
        var layerCount int
        estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
        layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
        if opts.NumGPU < 0 {
            if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
                return true, estimatedVRAM
            }
        } else {
            if layerCount > 0 && layerCount >= opts.NumGPU {
                return true, estimatedVRAM
            }
        }
    }
    return false, estimatedVRAM
}
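// A minimal caller sketch (hypothetical, for illustration only; not part of the
// scheduler's actual call site):
//
//	if fits, vram := PredictServerFit(allGpus, ggml, nil, nil, opts); fits {
//		slog.Info("model fits in available VRAM", "size", format.HumanBytes2(vram))
//	}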
type MemoryEstimate struct {
    // How many layers we predict we can load
    Layers int

    // The size of the graph which occupies the main GPU
    Graph uint64

    // How much VRAM will be allocated given the number of layers we predict
    VRAMSize uint64

    // The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
    TotalSize uint64

    // For multi-GPU scenarios, this provides the tensor split parameter
    TensorSplit string

    // For multi-GPU scenarios, this is the size in bytes per GPU
    GPUSizes []uint64

    // internal fields for logging purposes
    inferenceLibrary    string
    layersRequested     int
    layersModel         int
    availableList       []string
    kv                  uint64
    allocationsList     []string
    memoryWeights       uint64
    memoryLayerOutput   uint64
    graphFullOffload    uint64
    graphPartialOffload uint64
}
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
    // Graph size for a partial offload, applies to all GPUs
    var graphPartialOffload uint64

    // Graph size when all layers are offloaded, applies to all GPUs
    var graphFullOffload uint64

    // Final graph offload once we know full or partial
    var graphOffload uint64

    // Projectors loaded into GPU0 only
    var projectorSize uint64

    // Conditional output size on GPU 0
    var memoryLayerOutput uint64

    // The size of a single layer, used as a buffer
    var baseLayerSize uint64

    // The sum of all the layer sizes (just for logging)
    var memoryWeights uint64

    // True if all the layers are loaded
    var fullyLoaded bool

    // Overflow that didn't fit into the GPU
    var overflow uint64

    availableList := make([]string, len(gpus))
    for i, gpu := range gpus {
        availableList[i] = format.HumanBytes2(gpu.FreeMemory)
    }
    slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

    for _, projector := range projectors {
        projectorSize += projectorMemoryRequirements(projector)

        // multimodal models require at least 2048 context
        opts.NumCtx = max(opts.NumCtx, 2048)
    }

    layers := ggml.Tensors().Layers()

    // add one layer worth of memory as a buffer
    if blk0, ok := layers["blk.0"]; ok {
        baseLayerSize = blk0.size()
    } else {
        slog.Warn("model missing blk.0 layer size")
    }

    // fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
    kv := 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()

    // KV cache attributable to a single transformer block
    layerKV := kv / ggml.KV().BlockCount()
    baseLayerSize += layerKV
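    // For a rough sense of scale (hypothetical numbers, not taken from any
    // particular model): with NumCtx = 2048, 32 blocks, 8 KV heads, and
    // 128-dim K and V heads, kv = 2*2048*32*(128+128)*8 bytes = 256 MiB,
    // i.e. layerKV of 8 MiB per block.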
    graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
    if graphPartialOffload == 0 {
        graphPartialOffload = ggml.KV().GQA() * kv / 6
    }
    if graphFullOffload == 0 {
        graphFullOffload = graphPartialOffload
    }

    if gpus[0].Library == "metal" {
        // there's no partial offload overhead on metal
        graphPartialOffload = graphFullOffload
    } else if len(gpus) > 1 {
        // multigpu should always use the partial graph size
        graphFullOffload = graphPartialOffload
    }

    if layer, ok := layers["output_norm"]; ok {
        memoryLayerOutput += layer.size()
    }
    if layer, ok := layers["output"]; ok {
        memoryLayerOutput += layer.size()
    } else if layer, ok := layers["token_embd"]; ok {
        memoryLayerOutput += layer.size()
    }
    // Output layer handled at the end if we have space
    gpuZeroOverhead := projectorSize

    // Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
    var layerCount int
    layerCounts := make([]int, len(gpus))
    gpuAllocations := make([]uint64, len(gpus))
    type gs struct {
        i int
        g *gpu.GpuInfo
    }
    gpusWithSpace := []gs{}
    for i := range gpus {
        var gzo uint64
        if len(gpusWithSpace) == 0 {
            gzo = gpuZeroOverhead
        }
        // Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
        if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*baseLayerSize {
            slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
            continue
        }
        gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
        gpuAllocations[i] += gpus[i].MinimumMemory + baseLayerSize // We hold off on graph until we know partial vs. full
    }

    var gpuZeroID int
    if len(gpusWithSpace) > 0 {
        gpuZeroID = gpusWithSpace[0].i
        gpuAllocations[gpuZeroID] += gpuZeroOverhead
    }
    // For all the layers, find where they can fit on the GPU(s)
    for i := range int(ggml.KV().BlockCount()) {
        var layerSize uint64
        if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
            layerSize = blk.size()
        } else {
            slog.Error("missing layer", "blk", i)
            continue
        }
        memoryWeights += layerSize

        if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
            // Stop allocating on GPU(s) once we hit the user's target NumGPU
            continue
        }

        // distribute the layers round-robin across the GPU(s) that have space;
        // a GPU that can no longer fit another layer is dropped from gpusWithSpace
        for j := len(gpusWithSpace); j > 0; j-- {
            g := gpusWithSpace[i%j]
            used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
            if g.g.FreeMemory > used+layerSize+layerKV {
                gpuAllocations[g.i] += layerSize + layerKV
                layerCounts[g.i]++
                layerCount++
                break
            } else {
                gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
            }
        }
    }
    if layerCount >= int(ggml.KV().BlockCount()) {
        fullyLoaded = true
    } else {
        for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
            overflow += baseLayerSize
        }
    }

    // Determine if we need to consider output then find where it fits
    if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
        for j := len(gpusWithSpace); j > 0; j-- {
            g := gpusWithSpace[layerCount%j]
            used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
            if g.g.FreeMemory > used+memoryLayerOutput {
                gpuAllocations[g.i] += memoryLayerOutput
                layerCounts[g.i]++
                layerCount++
                break
            }
        }

        if layerCount < int(ggml.KV().BlockCount())+1 {
            fullyLoaded = false
            overflow += memoryLayerOutput
        }
    }
    // Add the applicable (full or partial) graph allocations
    for i := range gpus {
        if layerCounts[i] <= 0 {
            continue
        }
        if fullyLoaded {
            gpuAllocations[i] += graphFullOffload
        } else {
            gpuAllocations[i] += graphPartialOffload
        }
    }
    if fullyLoaded {
        graphOffload = graphFullOffload
    } else {
        graphOffload = graphPartialOffload
    }

    // Summaries for the log
    var memoryRequiredPartial, memoryRequiredTotal uint64
    for i := range gpuAllocations {
        memoryRequiredPartial += gpuAllocations[i]
    }
    memoryRequiredTotal = memoryRequiredPartial + overflow

    tensorSplit := ""
    if len(gpus) > 1 {
        splits := make([]string, len(gpus))
        for i, count := range layerCounts {
            splits[i] = strconv.Itoa(count)
        }
        tensorSplit = strings.Join(splits, ",")
    }
    allocationsList := make([]string, len(gpuAllocations))
    for i, a := range gpuAllocations {
        allocationsList[i] = format.HumanBytes2(a)
    }

    estimate := MemoryEstimate{
        TotalSize: memoryRequiredTotal,
        Layers:    0,
        Graph:     0,
        VRAMSize:  0,
        GPUSizes:  []uint64{},

        inferenceLibrary:    gpus[0].Library,
        layersRequested:     opts.NumGPU,
        layersModel:         int(ggml.KV().BlockCount()) + 1,
        availableList:       availableList,
        kv:                  kv,
        allocationsList:     allocationsList,
        memoryWeights:       memoryWeights,
        memoryLayerOutput:   memoryLayerOutput,
        graphFullOffload:    graphFullOffload,
        graphPartialOffload: graphPartialOffload,
    }

    if gpus[0].Library == "cpu" {
        return estimate
    }
    if layerCount == 0 {
        slog.Debug("insufficient VRAM to load any model layers")
        return estimate
    }

    estimate.Layers = layerCount
    estimate.Graph = graphOffload
    estimate.VRAMSize = memoryRequiredPartial
    estimate.TotalSize = memoryRequiredTotal
    estimate.TensorSplit = tensorSplit
    estimate.GPUSizes = gpuAllocations
    return estimate
}
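// A minimal usage sketch (hypothetical, for illustration only):
//
//	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
//	estimate.log()
//	if estimate.Layers == 0 {
//		// nothing fits on the GPU(s); fall back to CPU-only inference
//	}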
func (m MemoryEstimate) log() {
    slog.Info(
        "offload to "+m.inferenceLibrary,
        slog.Group(
            "layers",
            // requested number of layers to offload
            "requested", m.layersRequested,
            // The number of layers the model has (including output)
            "model", m.layersModel,
            // estimated number of layers that can be offloaded
            "offload", m.Layers,
            // multi-gpu split for tensors
            "split", m.TensorSplit,
        ),
        slog.Group(
            "memory",
            // memory available by GPU for offloading
            "available", m.availableList,
            slog.Group(
                "required",
                // memory required for full offloading
                "full", format.HumanBytes2(m.TotalSize),
                // memory required to offload the estimated number of layers
                "partial", format.HumanBytes2(m.VRAMSize),
                // memory of KV cache
                "kv", format.HumanBytes2(m.kv),
                // Allocations across the GPUs
                "allocations", m.allocationsList,
            ),
            slog.Group(
                "weights",
                // memory of the weights
                "total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
                // memory of repeating layers
                "repeating", format.HumanBytes2(m.memoryWeights),
                // memory of non-repeating layers
                "nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
            ),
            slog.Group(
                "graph",
                // memory of graph when fully offloaded
                "full", format.HumanBytes2(m.graphFullOffload),
                // memory of graph when not fully offloaded
                "partial", format.HumanBytes2(m.graphPartialOffload),
            ),
        ),
    )
}