// memory.go

package llm

import (
	"fmt"
	"log/slog"
	"os"
	"strconv"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs/ggml"
)
// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
	// Split up the GPUs by type and try them
	var estimatedVRAM uint64
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}
	}
	return false, estimatedVRAM
}
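// A minimal usage sketch (illustrative, not part of the upstream file): a caller
// that already has a decoded model f could check the fit before scheduling a load.
// It assumes api.DefaultOptions for the request options.
//
//	gpus := discover.GetGPUInfo()
//	if fits, vram := PredictServerFit(gpus, f, nil, nil, api.DefaultOptions(), 1); !fits {
//		slog.Info("model will not fully fit on GPU", "estimated_vram", format.HumanBytes2(vram))
//	}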
type MemoryEstimate struct {
	// How many layers we predict we can load
	Layers int

	// The size of the graph which occupies the main GPU
	Graph uint64

	// How much VRAM will be allocated given the number of layers we predict
	VRAMSize uint64

	// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
	TotalSize uint64

	// For multi-GPU scenarios, this provides the tensor split parameter
	TensorSplit string

	// For multi-GPU scenarios, this is the size in bytes per GPU
	GPUSizes []uint64

	// internal fields for logging purposes
	inferenceLibrary    string
	layersRequested     int
	layersModel         int
	availableList       []string
	kv                  uint64
	allocationsList     []string
	memoryWeights       uint64
	memoryLayerOutput   uint64
	graphFullOffload    uint64
	graphPartialOffload uint64

	projectorWeights, projectorGraph uint64
}
// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
	// Graph size for a partial offload, applies to all GPUs
	var graphPartialOffload uint64

	// Graph size when all layers are offloaded, applies to all GPUs
	var graphFullOffload uint64

	// Final graph offload once we know full or partial
	var graphOffload uint64

	// Projectors loaded into GPU0 only
	var projectorWeights uint64
	var projectorGraph uint64

	// Conditional output size on GPU 0
	var memoryLayerOutput uint64

	// The sizes of a layer
	var layerSize uint64

	// The sum of all the layer sizes (just for logging)
	var memoryWeights uint64

	// True if all the layers are loaded
	var fullyLoaded bool

	// Overflow that didn't fit into the GPU
	var overflow uint64

	overhead := envconfig.GpuOverhead()
	availableList := make([]string, len(gpus))
	for i, gpu := range gpus {
		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
	}
	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

	for _, projector := range projectors {
		weight, graph := projectorMemoryRequirements(projector)
		projectorWeights += weight
		projectorGraph += graph

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}
	if projectorWeights == 0 && projectorGraph == 0 {
		projectorWeights, projectorGraph = f.VisionGraphSize()
	}

	layers := f.Tensors().GroupLayers()
	// add one layer worth of memory as a buffer
	if blk0, ok := layers["blk.0"]; ok {
		layerSize = blk0.Size()
	} else {
		slog.Warn("model missing blk.0 layer size")
	}

	var kvct string
	if envconfig.FlashAttention() &&
		discover.GetGPUInfo().FlashAttentionSupported() &&
		f.SupportsFlashAttention() {
		requested := strings.ToLower(envconfig.KvCacheType())
		if requested != "" && f.SupportsKVCacheType(requested) {
			kvct = requested
		}
	}
	kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct)

	if len(kv) > 0 {
		layerSize += kv[0]
	}

	var kvTotal uint64
	for _, kvLayer := range kv {
		kvTotal += kvLayer
	}
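	// If the model metadata does not report a partial-offload graph size, fall
	// back to a rough estimate scaled by the grouped-query-attention factor and
	// the total KV cache size.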
	if graphPartialOffload == 0 {
		graphPartialOffload = f.KV().GQA() * kvTotal / 6
	}
	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

	// on metal there's no partial offload overhead
	if gpus[0].Library == "metal" {
		graphPartialOffload = graphFullOffload
	} else if len(gpus) > 1 {
		// multigpu should always use the partial graph size
		graphFullOffload = graphPartialOffload
	}

	if layer, ok := layers["output_norm"]; ok {
		memoryLayerOutput += layer.Size()
	}
	if layer, ok := layers["output"]; ok {
		memoryLayerOutput += layer.Size()
	} else if layer, ok := layers["token_embd"]; ok {
		memoryLayerOutput += layer.Size()
	}
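	// Models without a dedicated output tensor typically tie the output
	// projection to the token embedding, so token_embd stands in for the
	// output size above.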
	// Output layer handled at the end if we have space
	gpuZeroOverhead := projectorWeights + projectorGraph

	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
	var layerCount int
	layerCounts := make([]int, len(gpus))
	gpuAllocations := make([]uint64, len(gpus))
	type gs struct {
		i int
		g *discover.GpuInfo
	}
	gpusWithSpace := []gs{}
	for i := range gpus {
		var gzo uint64
		if len(gpusWithSpace) == 0 {
			gzo = gpuZeroOverhead
		}
		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
		if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
			slog.Debug("gpu has too little memory to allocate any layers",
				"id", gpus[i].ID,
				"library", gpus[i].Library,
				"variant", gpus[i].Variant,
				"compute", gpus[i].Compute,
				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
				"name", gpus[i].Name,
				"total", format.HumanBytes2(gpus[i].TotalMemory),
				"available", format.HumanBytes2(gpus[i].FreeMemory),
				"minimum_memory", gpus[i].MinimumMemory,
				"layer_size", format.HumanBytes2(layerSize),
				"gpu_zero_overhead", format.HumanBytes2(gzo),
				"partial_offload", format.HumanBytes2(graphPartialOffload),
				"full_offload", format.HumanBytes2(graphFullOffload),
			)
			continue
		}
		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
	}

	var gpuZeroID int
	if len(gpusWithSpace) > 0 {
		gpuZeroID = gpusWithSpace[0].i
		gpuAllocations[gpuZeroID] += gpuZeroOverhead
	}
	// For all the layers, find where they can fit on the GPU(s)
	for i := range int(f.KV().BlockCount()) {
		// Some models have inconsistent layer sizes
		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
			layerSize = blk.Size()
			layerSize += kv[i]
			memoryWeights += blk.Size()
		}

		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
			// Stop allocating on GPU(s) once we hit the user's target NumGPU
			continue
		}

		// distribute the layers across the GPU(s) that have space
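		// Layers are placed round-robin over the GPUs that still have space; a GPU
		// that can no longer fit another layer (plus the pending graph allocation)
		// is dropped from the rotation and the remaining GPUs absorb the rest.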
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[i%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if g.g.FreeMemory > overhead+used+layerSize {
				gpuAllocations[g.i] += layerSize
				layerCounts[g.i]++
				layerCount++
				break
			} else {
				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
			}
		}
	}

	if layerCount >= int(f.KV().BlockCount()) {
		fullyLoaded = true
	} else {
		for i := layerCount; i < int(f.KV().BlockCount()); i++ {
			overflow += layerSize
		}
	}

	// Determine if we need to consider output then find where it fits
	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[layerCount%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if g.g.FreeMemory > overhead+used+memoryLayerOutput {
				gpuAllocations[g.i] += memoryLayerOutput
				layerCounts[g.i]++
				layerCount++
				break
			}
		}

		if layerCount < int(f.KV().BlockCount())+1 {
			fullyLoaded = false
			overflow += memoryLayerOutput
		}
	}
	// Add the applicable (full or partial) graph allocations
	for i := range gpus {
		if layerCounts[i] <= 0 {
			continue
		}
		if fullyLoaded {
			gpuAllocations[i] += graphFullOffload
		} else {
			gpuAllocations[i] += graphPartialOffload
		}
	}
	if fullyLoaded {
		graphOffload = graphFullOffload
	} else {
		graphOffload = graphPartialOffload
	}

	// Summaries for the log
	var memoryRequiredPartial, memoryRequiredTotal uint64
	for i := range gpuAllocations {
		memoryRequiredPartial += gpuAllocations[i]
	}
	memoryRequiredTotal = memoryRequiredPartial + overflow

	tensorSplit := ""
	if len(gpus) > 1 {
		splits := make([]string, len(gpus))
		for i, count := range layerCounts {
			splits[i] = strconv.Itoa(count)
		}
		tensorSplit = strings.Join(splits, ",")
	}
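	// The tensor split is simply the per-GPU layer counts joined with commas;
	// for example, three GPUs taking 16, 16, and 17 layers would yield "16,16,17".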
	allocationsList := []string{}
	for _, a := range gpuAllocations {
		allocationsList = append(allocationsList, format.HumanBytes2(a))
	}

	estimate := MemoryEstimate{
		TotalSize: memoryRequiredTotal,
		Layers:    0,
		Graph:     0,
		VRAMSize:  0,
		GPUSizes:  []uint64{},

		inferenceLibrary:    gpus[0].Library,
		layersRequested:     opts.NumGPU,
		layersModel:         int(f.KV().BlockCount()) + 1,
		availableList:       availableList,
		kv:                  kvTotal,
		allocationsList:     allocationsList,
		memoryWeights:       memoryWeights,
		memoryLayerOutput:   memoryLayerOutput,
		graphFullOffload:    graphFullOffload,
		graphPartialOffload: graphPartialOffload,

		projectorWeights: projectorWeights,
		projectorGraph:   projectorGraph,
	}

	if gpus[0].Library == "cpu" {
		return estimate
	}
	if layerCount == 0 {
		slog.Debug("insufficient VRAM to load any model layers")
		return estimate
	}

	estimate.Layers = layerCount
	estimate.Graph = graphOffload
	estimate.VRAMSize = memoryRequiredPartial
	estimate.TotalSize = memoryRequiredTotal
	estimate.TensorSplit = tensorSplit
	estimate.GPUSizes = gpuAllocations
	return estimate
}
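// An illustrative sketch (not in the upstream source) of how a caller might act
// on the returned estimate; the full-offload check mirrors PredictServerFit above.
//
//	estimate := EstimateGPULayers(gpus, f, nil, opts, 1)
//	if estimate.Layers >= int(f.KV().BlockCount())+1 {
//		// fully offloaded: VRAMSize == TotalSize
//	} else {
//		slog.Info("partial offload", "layers", estimate.Layers,
//			"vram", format.HumanBytes2(estimate.VRAMSize))
//	}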
func (m MemoryEstimate) LogValue() slog.Value {
	attrs := []slog.Attr{
		slog.String("library", m.inferenceLibrary),
		slog.Group(
			"layers",
			// requested number of layers to offload
			"requested", m.layersRequested,
			// the number of layers the model has (including output)
			"model", m.layersModel,
			// estimated number of layers that can be offloaded
			"offload", m.Layers,
			// multi-gpu split for tensors
			"split", m.TensorSplit,
		),
		slog.Group(
			"memory",
			// memory available per GPU for offloading
			"available", m.availableList,
			"gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()),
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(m.TotalSize),
				// memory required to offload the estimated number of layers
				"partial", format.HumanBytes2(m.VRAMSize),
				// memory of KV cache
				"kv", format.HumanBytes2(m.kv),
				// allocations across the GPUs
				"allocations", m.allocationsList,
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
				// memory of repeating layers
				"repeating", format.HumanBytes2(m.memoryWeights),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(m.graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(m.graphPartialOffload),
			),
		),
	}

	if m.projectorWeights > 0 {
		attrs = append(attrs, slog.Group(
			"projector",
			"weights", format.HumanBytes2(m.projectorWeights),
			"graph", format.HumanBytes2(m.projectorGraph),
		))
	}

	return slog.GroupValue(attrs...)
}
func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
	file, err := os.Open(filename)
	if err != nil {
		return 0, 0
	}
	defer file.Close()

	ggml, _, err := ggml.Decode(file, 0)
	if err != nil {
		return 0, 0
	}

	for _, layer := range ggml.Tensors().GroupLayers() {
		weights += layer.Size()
	}

	switch arch := ggml.KV().Architecture(); arch {
	case "mllama":
		kv := func(n string) uint64 {
			if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
				return uint64(v)
			}

			return 0
		}

		imageSize := kv("image_size")
		maxNumTiles := kv("max_num_tiles")
		embeddingLength := kv("embedding_length")
		headCount := kv("attention.head_count")

		numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
		if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
			numPatches++
		}

		numPaddedPatches := numPatches + 8 - (numPatches%8)%8

		graphSize = 4 * (8 +
			imageSize*imageSize*kv("num_channels")*maxNumTiles +
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
	}

	return weights, graphSize
}
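// Worked example of the patch arithmetic above, using hypothetical metadata
// values chosen purely for illustration (not taken from any real model): with
// image_size=448 and patch_size=14, numPatches = (448/14)^2 = 1024; a class
// embedding bumps that to 1025, and the padding step gives
// 1025 + 8 - (1025%8)%8 = 1032 padded patches before the graph terms are summed.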