package llm

import (
	"fmt"
	"log/slog"
	"os"
	"strconv"
	"strings"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs/ggml"
)

// This algorithm looks for a complete fit to determine if we need to unload other models
func PredictServerFit(allGpus discover.GpuInfoList, f *ggml.GGML, adapters, projectors []string, opts api.Options, numParallel int) (bool, uint64) {
	// Split up the GPUs by type and try them
	var estimatedVRAM uint64
	for _, gpus := range allGpus.ByLibrary() {
		var layerCount int
		estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
		if opts.NumGPU < 0 {
			if layerCount > 0 && layerCount >= int(f.KV().BlockCount()+1) {
				return true, estimatedVRAM
			}
		} else {
			if layerCount > 0 && layerCount >= opts.NumGPU {
				return true, estimatedVRAM
			}
		}
	}
	return false, estimatedVRAM
}
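
// Illustrative usage (a sketch, not taken from the original file; f and opts are
// hypothetical values assumed to be in scope at the call site):
//
//	gpus := discover.GetGPUInfo()
//	if fits, vram := PredictServerFit(gpus, f, nil, nil, opts, 1); fits {
//		slog.Info("model fits on available GPUs", "vram", format.HumanBytes2(vram))
//	}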

type MemoryEstimate struct {
	// How many layers we predict we can load
	Layers int

	// The size of the graph which occupies the main GPU
	Graph uint64

	// How much VRAM will be allocated given the number of layers we predict
	VRAMSize uint64

	// The total size of the model if loaded into VRAM. If all layers are loaded, VRAMSize == TotalSize
	TotalSize uint64

	// For multi-GPU scenarios, this provides the tensor split parameter
	TensorSplit string

	// For multi-GPU scenarios, this is the size in bytes per GPU
	GPUSizes []uint64

	// internal fields for logging purposes
	inferenceLibrary    string
	layersRequested     int
	layersModel         int
	availableList       []string
	kv                  uint64
	allocationsList     []string
	memoryWeights       uint64
	memoryLayerOutput   uint64
	graphFullOffload    uint64
	graphPartialOffload uint64

	projectorWeights, projectorGraph uint64
}
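
// A purely illustrative value for a hypothetical two-GPU machine (the numbers are
// invented, not derived from any real model):
//
//	MemoryEstimate{
//		Layers:      33,
//		Graph:       1 << 30,  // ~1 GiB compute graph
//		VRAMSize:    9 << 30,  // ~9 GiB allocated across both GPUs
//		TotalSize:   9 << 30,  // fully loaded, so equal to VRAMSize
//		TensorSplit: "17,16",
//		GPUSizes:    []uint64{5 << 30, 4 << 30},
//	}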

// Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
// The GPUs provided must all be the same Library
func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []string, opts api.Options, numParallel int) MemoryEstimate {
	// Graph size for a partial offload, applies to all GPUs
	var graphPartialOffload uint64

	// Graph size when all layers are offloaded, applies to all GPUs
	var graphFullOffload uint64

	// Final graph offload once we know full or partial
	var graphOffload uint64

	// Projectors loaded into GPU0 only
	var projectorWeights uint64
	var projectorGraph uint64

	// Conditional output size on GPU 0
	var memoryLayerOutput uint64

	// The size of a single layer
	var layerSize uint64

	// The sum of all the layer sizes (just for logging)
	var memoryWeights uint64

	// True if all the layers are loaded
	var fullyLoaded bool

	// Overflow that didn't fit into the GPU
	var overflow uint64

	overhead := envconfig.GpuOverhead()

	availableList := make([]string, len(gpus))
	for i, gpu := range gpus {
		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
	}
	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

	for _, projector := range projectors {
		weight, graph := projectorMemoryRequirements(projector)
		projectorWeights += weight
		projectorGraph += graph

		// multimodal models require at least 2048 context
		opts.NumCtx = max(opts.NumCtx, 2048)
	}
	if projectorWeights == 0 && projectorGraph == 0 {
		projectorWeights, projectorGraph = f.VisionGraphSize()
	}
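
	// Note: the projector (or built-in vision graph) memory computed above is not spread
	// across GPUs; it is charged to the first GPU with space, via gpuZeroOverhead below.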

	layers := f.Tensors().GroupLayers()
	// add one layer's worth of memory as a buffer
	if blk0, ok := layers["blk.0"]; ok {
		layerSize = blk0.Size()
	} else {
		slog.Warn("model missing blk.0 layer size")
	}

	var kvct string
	if envconfig.FlashAttention() &&
		discover.GetGPUInfo().FlashAttentionSupported() &&
		f.SupportsFlashAttention() {
		requested := strings.ToLower(envconfig.KvCacheType())
		if requested != "" && f.SupportsKVCacheType(requested) {
			kvct = requested
		}
	}
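
	// If flash attention is unavailable or the requested cache type is unsupported, kvct
	// stays empty and GraphSize falls back to its default KV cache type (presumably f16).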
	kv, graphPartialOffload, graphFullOffload := f.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)), numParallel, kvct)

	if len(kv) > 0 {
		layerSize += kv[0]
	}

	var kvTotal uint64
	for _, kvLayer := range kv {
		kvTotal += kvLayer
	}
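
	// If the model does not report a partial-offload graph size, fall back to a rough
	// heuristic derived from the grouped-query-attention factor and the total KV cache size.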
	if graphPartialOffload == 0 {
		graphPartialOffload = f.KV().GQA() * kvTotal / 6
	}
	if graphFullOffload == 0 {
		graphFullOffload = graphPartialOffload
	}

	// on metal there's no partial offload overhead
	if gpus[0].Library == "metal" {
		graphPartialOffload = graphFullOffload
	} else if len(gpus) > 1 {
		// multigpu should always use the partial graph size
		graphFullOffload = graphPartialOffload
	}

	if layer, ok := layers["output_norm"]; ok {
		memoryLayerOutput += layer.Size()
	}
	if layer, ok := layers["output"]; ok {
		memoryLayerOutput += layer.Size()
	} else if layer, ok := layers["token_embd"]; ok {
		memoryLayerOutput += layer.Size()
	}

	// Output layer handled at the end if we have space
	gpuZeroOverhead := projectorWeights + projectorGraph

	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
	var layerCount int
	layerCounts := make([]int, len(gpus))
	gpuAllocations := make([]uint64, len(gpus))
	type gs struct {
		i int
		g *discover.GpuInfo
	}
	gpusWithSpace := []gs{}
	for i := range gpus {
		var gzo uint64
		if len(gpusWithSpace) == 0 {
			gzo = gpuZeroOverhead
		}
		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
		if gpus[i].FreeMemory < overhead+gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
			slog.Debug("gpu has too little memory to allocate any layers",
				"id", gpus[i].ID,
				"library", gpus[i].Library,
				"variant", gpus[i].Variant,
				"compute", gpus[i].Compute,
				"driver", fmt.Sprintf("%d.%d", gpus[i].DriverMajor, gpus[i].DriverMinor),
				"name", gpus[i].Name,
				"total", format.HumanBytes2(gpus[i].TotalMemory),
				"available", format.HumanBytes2(gpus[i].FreeMemory),
				"minimum_memory", gpus[i].MinimumMemory,
				"layer_size", format.HumanBytes2(layerSize),
				"gpu_zero_overhead", format.HumanBytes2(gzo),
				"partial_offload", format.HumanBytes2(graphPartialOffload),
				"full_offload", format.HumanBytes2(graphFullOffload),
			)
			continue
		}
		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
	}

	var gpuZeroID int
	if len(gpusWithSpace) > 0 {
		gpuZeroID = gpusWithSpace[0].i
		gpuAllocations[gpuZeroID] += gpuZeroOverhead
	}
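
	// The loop below deals layers out round-robin across the GPUs that still have space:
	// layer i goes to gpusWithSpace[i%j], and a GPU that can no longer fit another layer is
	// removed from the rotation so the remaining candidates are retried for that same layer.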
	// For all the layers, find where they can fit on the GPU(s)
	for i := range int(f.KV().BlockCount()) {
		// Some models have inconsistent layer sizes
		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
			layerSize = blk.Size()
			layerSize += kv[i]
			memoryWeights += blk.Size()
		}

		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
			// Stop allocating on GPU(s) once we hit the user's target NumGPU
			continue
		}

		// distribute the layers across the GPU(s) that have space
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[i%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if g.g.FreeMemory > overhead+used+layerSize {
				gpuAllocations[g.i] += layerSize
				layerCounts[g.i]++
				layerCount++
				break
			} else {
				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
			}
		}
	}
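
	// Worked example for the loop above (hypothetical numbers): with two eligible GPUs and a
	// roughly 1 GiB layer, layer 0 lands on GPU 0, layer 1 on GPU 1, layer 2 on GPU 0, and so
	// on. If GPU 1 runs out of headroom at layer 7, it drops out of gpusWithSpace and layers 7
	// onward go to GPU 0 until it too can no longer fit a layer.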

	if layerCount >= int(f.KV().BlockCount()) {
		fullyLoaded = true
	} else {
		for i := layerCount; i < int(f.KV().BlockCount()); i++ {
			overflow += layerSize
		}
	}

	// Determine if we need to consider output then find where it fits
	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
		for j := len(gpusWithSpace); j > 0; j-- {
			g := gpusWithSpace[layerCount%j]
			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
			if g.g.FreeMemory > overhead+used+memoryLayerOutput {
				gpuAllocations[g.i] += memoryLayerOutput
				layerCounts[g.i]++
				layerCount++
				break
			}
		}

		if layerCount < int(f.KV().BlockCount())+1 {
			fullyLoaded = false
			overflow += memoryLayerOutput
		}
	}

	// Add the applicable (full or partial) graph allocations
	for i := range gpus {
		if layerCounts[i] <= 0 {
			continue
		}
		if fullyLoaded {
			gpuAllocations[i] += graphFullOffload
		} else {
			gpuAllocations[i] += graphPartialOffload
		}
	}
	if fullyLoaded {
		graphOffload = graphFullOffload
	} else {
		graphOffload = graphPartialOffload
	}

	// Summaries for the log
	var memoryRequiredPartial, memoryRequiredTotal uint64
	for i := range gpuAllocations {
		memoryRequiredPartial += gpuAllocations[i]
	}
	memoryRequiredTotal = memoryRequiredPartial + overflow

	tensorSplit := ""
	if len(gpus) > 1 {
		splits := make([]string, len(gpus))
		for i, count := range layerCounts {
			splits[i] = strconv.Itoa(count)
		}
		tensorSplit = strings.Join(splits, ",")
	}
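
	// TensorSplit ends up as a comma-separated list of per-GPU layer counts, e.g. "17,16" for
	// a 33-layer model split across two GPUs; per the MemoryEstimate field comment above, the
	// caller is expected to pass it through as the runner's tensor-split parameter.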

	allocationsList := []string{}
	for _, a := range gpuAllocations {
		allocationsList = append(allocationsList, format.HumanBytes2(a))
	}

	estimate := MemoryEstimate{
		TotalSize: memoryRequiredTotal,
		Layers:    0,
		Graph:     0,
		VRAMSize:  0,
		GPUSizes:  []uint64{},

		inferenceLibrary:    gpus[0].Library,
		layersRequested:     opts.NumGPU,
		layersModel:         int(f.KV().BlockCount()) + 1,
		availableList:       availableList,
		kv:                  kvTotal,
		allocationsList:     allocationsList,
		memoryWeights:       memoryWeights,
		memoryLayerOutput:   memoryLayerOutput,
		graphFullOffload:    graphFullOffload,
		graphPartialOffload: graphPartialOffload,

		projectorWeights: projectorWeights,
		projectorGraph:   projectorGraph,
	}

	if gpus[0].Library == "cpu" {
		return estimate
	}
	if layerCount == 0 {
		slog.Debug("insufficient VRAM to load any model layers")
		return estimate
	}
	estimate.Layers = layerCount
	estimate.Graph = graphOffload
	estimate.VRAMSize = memoryRequiredPartial
	estimate.TotalSize = memoryRequiredTotal
	estimate.TensorSplit = tensorSplit
	estimate.GPUSizes = gpuAllocations
	return estimate
}
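
// Illustrative call site (a sketch only; variable names are hypothetical, and whether the
// caller feeds Layers back into NumGPU this way is an assumption):
//
//	estimate := EstimateGPULayers(gpus, f, projectors, opts, numParallel)
//	slog.Info("offload", "estimate", estimate) // rendered via LogValue below
//	if opts.NumGPU < 0 {
//		opts.NumGPU = estimate.Layers
//	}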

func (m MemoryEstimate) LogValue() slog.Value {
	attrs := []slog.Attr{
		slog.String("library", m.inferenceLibrary),
		slog.Group(
			"layers",
			// requested number of layers to offload
			"requested", m.layersRequested,
			// The number of layers the model has (including output)
			"model", m.layersModel,
			// estimated number of layers that can be offloaded
			"offload", m.Layers,
			// multi-gpu split for tensors
			"split", m.TensorSplit,
		),
		slog.Group(
			"memory",
			// memory available by GPU for offloading
			"available", m.availableList,
			"gpu_overhead", format.HumanBytes2(envconfig.GpuOverhead()),
			slog.Group(
				"required",
				// memory required for full offloading
				"full", format.HumanBytes2(m.TotalSize),
				// memory required to offload the estimated number of layers
				"partial", format.HumanBytes2(m.VRAMSize),
				// memory of KV cache
				"kv", format.HumanBytes2(m.kv),
				// Allocations across the GPUs
				"allocations", m.allocationsList,
			),
			slog.Group(
				"weights",
				// memory of the weights
				"total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
				// memory of repeating layers
				"repeating", format.HumanBytes2(m.memoryWeights),
				// memory of non-repeating layers
				"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
			),
			slog.Group(
				"graph",
				// memory of graph when fully offloaded
				"full", format.HumanBytes2(m.graphFullOffload),
				// memory of graph when not fully offloaded
				"partial", format.HumanBytes2(m.graphPartialOffload),
			),
		),
	}

	if m.projectorWeights > 0 {
		attrs = append(attrs, slog.Group(
			"projector",
			"weights", format.HumanBytes2(m.projectorWeights),
			"graph", format.HumanBytes2(m.projectorGraph),
		))
	}

	return slog.GroupValue(attrs...)
}

func projectorMemoryRequirements(filename string) (weights, graphSize uint64) {
	file, err := os.Open(filename)
	if err != nil {
		return 0, 0
	}
	defer file.Close()

	ggml, _, err := ggml.Decode(file, 0)
	if err != nil {
		return 0, 0
	}

	for _, layer := range ggml.Tensors().GroupLayers() {
		weights += layer.Size()
	}

	switch arch := ggml.KV().Architecture(); arch {
	case "mllama":
		kv := func(n string) uint64 {
			if v, ok := ggml.KV()[arch+".vision."+n].(uint32); ok {
				return uint64(v)
			}
			return 0
		}

		imageSize := kv("image_size")
		maxNumTiles := kv("max_num_tiles")
		embeddingLength := kv("embedding_length")
		headCount := kv("attention.head_count")

		numPatches := (imageSize / kv("patch_size")) * (imageSize / kv("patch_size"))
		if _, ok := ggml.Tensors().GroupLayers()["v"]["class_embd"]; ok {
			numPatches++
		}

		numPaddedPatches := numPatches + 8 - (numPatches%8)%8
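
		// The estimate below appears to assume float32 activations (the leading factor of 4):
		// roughly the input image tensor, the per-tile patch embeddings, intermediate
		// activations sized at 9x the embedding width per padded patch, and the attention
		// score matrices per head.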
		graphSize = 4 * (8 +
			imageSize*imageSize*kv("num_channels")*maxNumTiles +
			embeddingLength*numPatches*maxNumTiles +
			9*embeddingLength*numPaddedPatches*maxNumTiles +
			numPaddedPatches*maxNumTiles*numPaddedPatches*maxNumTiles*headCount)
	}

	return weights, graphSize
}