@@ -5,10 +5,11 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
-	"runtime"
 	"slices"
+	"strings"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
 
@@ -24,7 +25,7 @@ var cpuOnlyFamilies = []string{
 	"mamba",
 }
 
-func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
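
(Aside, not part of the patch: the signature change from api.Options to *api.Options matters because New now mutates option fields such as NumCtx and NumGPU, and with a value parameter those writes would be invisible outside the function. A minimal sketch of the difference, using a hypothetical options type rather than the real api.Options:)

package main

import "fmt"

type options struct{ NumCtx int }

// byValue receives a copy, so its write is lost; byPointer's write persists.
func byValue(o options)    { o.NumCtx = 2048 }
func byPointer(o *options) { o.NumCtx = 2048 }

func main() {
	o := options{NumCtx: 4}
	byValue(o)
	fmt.Println(o.NumCtx) // 4
	byPointer(&o)
	fmt.Println(o.NumCtx) // 2048
}
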
@@ -35,7 +36,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}
 	defer f.Close()
 
-	ggml, size, err := DecodeGGML(f)
+	ggml, _, err := DecodeGGML(f)
 	if err != nil {
 		return nil, err
 	}
@@ -49,84 +50,93 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 		opts.NumCtx = 4
 	}
 
-	vram, _ := gpu.CheckVRAM()
+	availableMemory, _ := gpu.CheckVRAM()
+	info := gpu.GetGPUInfo()
+
+	usedMemory := info.MinimumMemory
+	for _, projector := range projectors {
+		usedMemory += projectorMemoryRequirements(projector)
+
+		// multimodal models require at least 2048 context
+		opts.NumCtx = max(opts.NumCtx, 2048)
+	}
 
-	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
+	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
 	graph := int64(ggml.KV().GQA()) * kv / 6
+	usedMemory += graph
 
-	if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
-		opts.NumGPU = 0
+	if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+		info.Library = "cpu"
 	}
 
-	info := gpu.GetGPUInfo()
-	switch runtime.GOOS {
-	case "darwin":
-		if opts.NumGPU == 0 {
-			break
-		}
+	requiredMemory := usedMemory
 
-		if size+kv+graph > vram {
-			slog.Info("not enough vram available, setting num_gpu=0")
-			opts.NumGPU = 0
-			break
-		}
+	var layers int
+	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+		requiredMemory += layerMemory
 
-		// TODO: implement layer splitting on macOS
-		opts.NumGPU = 999
-	default:
-		if info.Library == "cpu" {
-			slog.Info("GPU not available, falling back to CPU")
-			opts.NumGPU = 0
-			break
+		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
+			usedMemory += layerMemory
+			layers++
 		}
+	}
 
-		// don't use GPU at all if no layers are loaded
-		if opts.NumGPU == 0 {
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
-			break
-		}
+	memOutputLayer := ggml.LayerSize("output.")
+	requiredMemory += memOutputLayer
 
-		// user-defined GPU count
-		if opts.NumGPU != -1 {
-			break
-		}
+	// only offload output layer if all repeating layers are offloaded
+	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
+		usedMemory += memOutputLayer
+		layers++
+	}
 
-		// the "main" GPU needs the most memory and determines the limit
-		// of how many layers can be loaded. It needs to fit:
-		// 1. the full compute graph allocation for all devices (graph)
-		// 2. the proportional kv cache for all devices (kv * % layers)
-		// 3. the proportional model (size * % layers / # devices)
-		// This estimates the number of layers
-		maxlayers := int64(ggml.KV().BlockCount()) + 1
-		devices := int64(info.DeviceCount)
-		avg := vram / devices
-		layers := maxlayers * (avg - graph) / (kv + size/devices)
-		if layers > maxlayers {
-			layers = maxlayers
-		}
+	slog.Info(
+		"offload to gpu",
+		"layers", layers,
+		"required", format.HumanBytes2(requiredMemory),
+		"used", format.HumanBytes2(usedMemory),
+		"available", format.HumanBytes2(availableMemory),
+		"kv", format.HumanBytes2(kv),
+		"graph", format.HumanBytes2(graph),
+	)
+
+	if opts.NumGPU < 0 && info.Library != "cpu" {
+		opts.NumGPU = layers
+	}
 
-		// 1 + 2 must fit on the main gpu
-		min := graph + kv*layers/maxlayers
-		if layers <= 0 || min > avg {
-			slog.Info("not enough vram available, falling back to CPU only")
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
-			opts.NumGPU = 0
-			break
-		}
+	return newLlmServer(info, model, adapters, projectors, opts)
+}
 
-		opts.NumGPU = int(layers)
+func projectorMemoryRequirements(filename string) int64 {
+	file, err := os.Open(filename)
+	if err != nil {
+		return 0
 	}
+	defer file.Close()
 
-	opts.RopeFrequencyBase = 0.0
-	opts.RopeFrequencyScale = 0.0
-	return newLlmServer(info, model, adapters, projectors, opts)
+	ggml, _, err := DecodeGGML(file)
+	if err != nil {
+		return 0
+	}
+
+	prefixes := make(map[string]struct{})
+	for _, layer := range ggml.Tensors() {
+		parts := strings.Split(layer.Name, ".")
+		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+	}
+
+	var ask int64
+	for prefix := range prefixes {
+		ask += ggml.LayerSize(prefix)
+	}
+
+	return ask
 }
 
 // Give any native cgo implementations an opportunity to initialize
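
(Aside, not part of the patch: the fp16 KV-cache estimate in the hunk above is easy to sanity-check with concrete numbers. A minimal sketch, assuming illustrative Llama-7B-style hyperparameters; these values are assumptions, not read from any model file:)

package main

import "fmt"

func main() {
	// kv = 2 (k and v) * 2 (bytes per fp16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	// assumed values: n_ctx=2048, n_layer=32, n_embd=4096, n_head=32, n_head_kv=32
	var numCtx, blockCount, embeddingLength, headCount, headCountKV int64 = 2048, 32, 4096, 32, 32
	kv := 2 * 2 * numCtx * blockCount * embeddingLength / headCount * headCountKV
	fmt.Println(kv) // 1073741824 bytes, i.e. 1 GiB for this configuration
}
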
@@ -134,7 +144,7 @@ func Init() error {
 	return nativeInit()
 }
 
-func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	dynLibs := getDynLibs(gpuInfo)
 
 	// Check to see if the user has requested a specific library instead of auto-detecting
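
(Aside, not part of the patch: the offload policy introduced above reads more clearly in isolation. A minimal standalone sketch of the same greedy strategy; greedyOffload and every size below are hypothetical stand-ins for the ggml.LayerSize lookups in the patch:)

package main

import "fmt"

// greedyOffload walks the repeating layers in order, offloading each one
// while it still fits in available memory and the user cap is not exceeded
// (numGPU < 0 means no cap), then offloads the output layer only if every
// repeating layer made it onto the GPU.
func greedyOffload(available, used int64, layerSizes []int64, outputSize int64, numGPU int) int {
	var layers int
	for _, size := range layerSizes {
		if available > used+size && (numGPU < 0 || layers < numGPU) {
			used += size
			layers++
		}
	}
	if layers >= len(layerSizes) && available > used+outputSize {
		layers++
	}
	return layers
}

func main() {
	// 8 GiB available, 1 GiB already used, 32 repeating layers of 200 MiB,
	// a 500 MiB output layer, and no user-imposed layer cap.
	sizes := make([]int64, 32)
	for i := range sizes {
		sizes[i] = 200 << 20
	}
	fmt.Println(greedyOffload(8<<30, 1<<30, sizes, 500<<20, -1)) // 33
}
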
|