memory.go

package llm

import (
	"fmt"
	"log/slog"
	"os"
	"strconv"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
)

// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
	// Split up the GPUs by type and try them
	var estimatedVRAM uint64
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}
	}
	return false, estimatedVRAM
}
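
// A minimal caller sketch (hypothetical, not part of the original file). It assumes
// discover.GetGPUInfo and api.DefaultOptions from the imported packages and a
// previously decoded *GGML:
//
//	gpus := discover.GetGPUInfo()
//	if fits, vram := PredictServerFit(gpus, ggml, nil, nil, api.DefaultOptions()); fits {
//		slog.Info("model fits entirely in VRAM", "estimated", format.HumanBytes2(vram))
//	}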

type MemoryEstimate struct {
	// How many layers we predict we can load
	Layers int

	// The size of the graph which occupies the main GPU
	Graph uint64

	// How much VRAM will be allocated given the number of layers we predict
	VRAMSize uint64

	// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
	TotalSize uint64

	// For multi-GPU scenarios, this provides the tensor split parameter
	TensorSplit string

	// For multi-GPU scenarios, this is the size in bytes per GPU
	GPUSizes []uint64

	// internal fields for logging purposes
	inferenceLibrary    string
	layersRequested     int
	layersModel         int
	availableList       []string
	kv                  uint64
	allocationsList     []string
	memoryWeights       uint64
	memoryLayerOutput   uint64
	graphFullOffload    uint64
	graphPartialOffload uint64

	projectorWeights, projectorGraph uint64
}

// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
	// Graph size for a partial offload, applies to all GPUs
	var graphPartialOffload uint64

	// Graph size when all layers are offloaded, applies to all GPUs
	var graphFullOffload uint64

	// Final graph offload once we know full or partial
	var graphOffload uint64

	// Projectors loaded into GPU0 only
	var projectorWeights uint64
	var projectorGraph uint64

	// Conditional output size on GPU 0
	var memoryLayerOutput uint64

	// The sizes of a layer
	var layerSize uint64

	// The sum of all the layer sizes (just for logging)
	var memoryWeights uint64

	// True if all the layers are loaded
	var fullyLoaded bool

	// Overflow that didn't fit into the GPU
	var overflow uint64

	overhead := envconfig.GpuOverhead()
	availableList := make([]string, len(gpus))
	for i, gpu := range gpus {
		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
	}
	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

	for _, projector := range projectors {
		weight, graph := projectorMemoryRequirements(projector)
		projectorWeights += weight
		projectorGraph += graph

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}

	layers := ggml.Tensors().Layers()
	// add one layer worth of memory as a buffer
	if blk0, ok := layers["blk.0"]; ok {
		layerSize = blk0.size()
	} else {
		slog.Warn("model missing blk.0 layer size")
	}

	kv, graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
	if graphPartialOffload == 0 {
		graphPartialOffload = ggml.KV().GQA() * kv / 6
	}
	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

	// KV is proportional to the number of layers
	layerSize += kv / ggml.KV().BlockCount()

	// on metal there's no partial offload overhead
	if gpus[0].Library == "metal" {
		graphPartialOffload = graphFullOffload
	} else if len(gpus) > 1 {
		// multigpu should always use the partial graph size
		graphFullOffload = graphPartialOffload
	}

	if layer, ok := layers["output_norm"]; ok {
		memoryLayerOutput += layer.size()
	}
	if layer, ok := layers["output"]; ok {
		memoryLayerOutput += layer.size()
	} else if layer, ok := layers["token_embd"]; ok {
		memoryLayerOutput += layer.size()
	}

	// Output layer handled at the end if we have space
	gpuZeroOverhead := projectorWeights + projectorGraph

	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
	var layerCount int
	layerCounts := make([]int, len(gpus))
	gpuAllocations := make([]uint64, len(gpus))
	type gs struct {
		i int
		g *discover.GpuInfo
	}
	gpusWithSpace := []gs{}
	for i := range gpus {
		var gzo uint64
		if len(gpusWithSpace) == 0 {
			gzo = gpuZeroOverhead
		}
		// Only include GPUs that can fit the graph, GPU minimum, the layer buffer and at least one more layer
		if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
			slog.Debug("gpu has too little memory to allocate any layers",
				"id", gpus[i].ID,
				"library", gpus[i].Library,
				"variant", gpus[i].Variant,
				"compute", gpus[i].Compute,
				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
				"name", gpus[i].Name,
				"total", format.HumanBytes2(gpus[i].TotalMemory),
				"available", format.HumanBytes2(gpus[i].FreeMemory),
				"minimum_memory", gpus[i].MinimumMemory,
				"layer_size", format.HumanBytes2(layerSize),
				"gpu_zero_overhead", format.HumanBytes2(gzo),
				"partial_offload", format.HumanBytes2(graphPartialOffload),
				"full_offload", format.HumanBytes2(graphFullOffload),
			)
			continue
		}
		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
	}

	var gpuZeroID int
	if len(gpusWithSpace) > 0 {
		gpuZeroID = gpusWithSpace[0].i
		gpuAllocations[gpuZeroID] += gpuZeroOverhead
	}

	// For all the layers, find where they can fit on the GPU(s)
	for i := range int(ggml.KV().BlockCount()) {
		// Some models have inconsistent layer sizes
		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
			layerSize = blk.size()
			layerSize += kv / ggml.KV().BlockCount()
		}
		memoryWeights += layerSize

		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
			// Stop allocating on GPU(s) once we hit the user's target NumGPU
			continue
		}

		// distribute the layers across the GPU(s) that have space
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[i%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if (g.g.FreeMemory - overhead) > used+layerSize {
				gpuAllocations[g.i] += layerSize
				layerCounts[g.i]++
				layerCount++
				break
			} else {
				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
			}
		}
	}
	if layerCount >= int(ggml.KV().BlockCount()) {
		fullyLoaded = true
	} else {
		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
			overflow += layerSize
		}
	}

	// Determine if we need to consider output, then find where it fits
	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[layerCount%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
				gpuAllocations[g.i] += memoryLayerOutput
				layerCounts[g.i]++
				layerCount++
				break
			}
		}

		if layerCount < int(ggml.KV().BlockCount())+1 {
			fullyLoaded = false
			overflow += memoryLayerOutput
		}
	}

	// Add the applicable (full or partial) graph allocations
	for i := range gpus {
		if layerCounts[i] <= 0 {
			continue
		}
		if fullyLoaded {
			gpuAllocations[i] += graphFullOffload
		} else {
			gpuAllocations[i] += graphPartialOffload
		}
	}
	if fullyLoaded {
		graphOffload = graphFullOffload
	} else {
		graphOffload = graphPartialOffload
	}

	// Summaries for the log
	var memoryRequiredPartial, memoryRequiredTotal uint64
	for i := range gpuAllocations {
		memoryRequiredPartial += gpuAllocations[i]
	}
	memoryRequiredTotal = memoryRequiredPartial + overflow

	tensorSplit := ""
	if len(gpus) > 1 {
		splits := make([]string, len(gpus))
		for i, count := range layerCounts {
			splits[i] = strconv.Itoa(count)
		}
		tensorSplit = strings.Join(splits, ",")
	}

	allocationsList := []string{}
	for _, a := range gpuAllocations {
		allocationsList = append(allocationsList, format.HumanBytes2(a))
	}

	estimate := MemoryEstimate{
		TotalSize: memoryRequiredTotal,
		Layers:    0,
		Graph:     0,
		VRAMSize:  0,
		GPUSizes:  []uint64{},

		inferenceLibrary:    gpus[0].Library,
		layersRequested:     opts.NumGPU,
		layersModel:         int(ggml.KV().BlockCount()) + 1,
		availableList:       availableList,
		kv:                  kv,
		allocationsList:     allocationsList,
		memoryWeights:       memoryWeights,
		memoryLayerOutput:   memoryLayerOutput,
		graphFullOffload:    graphFullOffload,
		graphPartialOffload: graphPartialOffload,
		projectorWeights:    projectorWeights,
		projectorGraph:      projectorGraph,
	}

	if gpus[0].Library == "cpu" {
		return estimate
	}
	if layerCount == 0 {
		slog.Debug("insufficient VRAM to load any model layers")
		return estimate
	}
	estimate.Layers = layerCount
	estimate.Graph = graphOffload
	estimate.VRAMSize = memoryRequiredPartial
	estimate.TotalSize = memoryRequiredTotal
	estimate.TensorSplit = tensorSplit
	estimate.GPUSizes = gpuAllocations
	return estimate
}
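
// A hypothetical consumer sketch (flag names assumed, not defined in this file):
// a runner could translate the estimate into llama.cpp-style server arguments.
//
//	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
//	args := []string{"--n-gpu-layers", strconv.Itoa(estimate.Layers)}
//	if estimate.TensorSplit != "" {
//		args = append(args, "--tensor-split", estimate.TensorSplit)
//	}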

func (m MemoryEstimate) log() {
	overhead := envconfig.GpuOverhead()

	log := slog.With()
	if m.projectorWeights > 0 {
		log = log.With(
			slog.Group(
				"projector",
				"weights", format.HumanBytes2(m.projectorWeights),
				"graph", format.HumanBytes2(m.projectorGraph),
			),
		)
	}

	log.Info(
		"offload to "+m.inferenceLibrary,
		slog.Group(
			"layers",
			// requested number of layers to offload
			"requested", m.layersRequested,
			// The number of layers the model has (including output)
			"model", m.layersModel,
			// estimated number of layers that can be offloaded
			"offload", m.Layers,
			// multi-gpu split for tensors
			"split", m.TensorSplit,
		),
		slog.Group(
			"memory",
			// memory available by GPU for offloading
			"available", m.availableList,
			"gpu_overhead", format.HumanBytes2(overhead),
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(m.TotalSize),
				// memory required to offload the estimated number of layers
				"partial", format.HumanBytes2(m.VRAMSize),
				// memory of KV cache
				"kv", format.HumanBytes2(m.kv),
				// Allocations across the GPUs
				"allocations", m.allocationsList,
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(m.memoryWeights),
				// memory of repeating layers
				"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(m.graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(m.graphPartialOffload),
			),
		),
	)
}

func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
	file, err := os.Open(filename)
	if err != nil {
		return 0, 0
	}
	defer file.Close()

	ggml, _, err := DecodeGGML(file, 0)
	if err != nil {
		return 0, 0
	}

	for _, layer := range ggml.Tensors().Layers() {
		weights += layer.size()
	}

	switch arch := ggml.KV().Architecture(); arch {
	case "mllama":
		kv := func(n string) uint64 {
			if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
				return uint64(v)
			}

			return 0
		}

		imageSize := kv("image_size")
		maxNumTiles := kv("max_num_tiles")
		embeddingLength := kv("embedding_length")
		headCount := kv("attention.head_count")

		numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
		if _, ok := ggml.Tensors().Layers()["v"]["class_embd"]; ok {
			numPatches++
		}

		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

		graphSize = 4 * (8 +
			imageSize*imageSize*kv("num_channels")*maxNumTiles +
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
	}

	return weights, graphSize
}