@@ -2,7 +2,6 @@ package llm
 
 import (
 	"context"
-	"fmt"
 	"log"
 	"os"
 	"runtime"
@@ -41,94 +40,76 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		opts.NumCtx = 4
 	}
 
-	fmt.Println("size", ggml.Size)
-	fmt.Println("filetype", ggml.FileType())
-	fmt.Println("architecture", ggml.ModelFamily())
-	fmt.Println("type", ggml.ModelType())
-	fmt.Println("name", ggml.Name())
-	fmt.Println("embd", ggml.NumEmbed())
-	fmt.Println("head", ggml.NumHead())
-	fmt.Println("head_kv", ggml.NumHeadKv())
-	fmt.Println("gqa", ggml.NumGQA())
-
-	available, _ := gpu.CheckVRAM()
-
-	// For now assume filesize = model size
-	// TODO: use actual model size
-	requiredModel := ggml.Size
+	vram, _ := gpu.CheckVRAM()
+	size := ggml.Size
 
 	// fp16 k,v matrices require n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 (key and value)
-	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
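As a rough worked example (all figures assumed for illustration, not taken from this change): a 7B-class model with NumCtx=2048, NumLayers()=32, NumEmbed()=4096, NumHead()=32 and NumHeadKv()=32 gives kv = 2 * 2 * 2048 * 32 * 4096 * 32/32 = 1,073,741,824 bytes, i.e. roughly 1 GiB of fp16 key/value cache.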
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from llama.cpp's graph calculations instead of
 	// estimating it as 1/6 * kv_cache_size * num_gqa
-	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
-
-	requiredTotal := requiredModel + requiredKv + requiredAlloc
-
-	log.Println("system memory bytes:", available)
-	log.Println("required model bytes:", requiredModel)
-	log.Println("required kv bytes:", requiredKv)
-	log.Println("required alloc bytes:", requiredAlloc)
-	log.Println("required total bytes:", requiredTotal)
+	graph := int64(ggml.NumGQA()) * kv / 6
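Both estimates are easy to sanity-check in isolation. A minimal, self-contained sketch (the parameter values below are assumptions for a hypothetical 7B-class model, not values produced by this code):

package main

import "fmt"

func main() {
	// assumed model/runtime parameters, for illustration only
	var (
		numCtx    int64 = 2048
		numLayers int64 = 32
		numEmbed  int64 = 4096
		numHead   int64 = 32
		numHeadKv int64 = 32
	)

	// fp16 k,v cache: 2 bytes * 2 (key and value) * ctx * layers * embd * head_kv / head
	kv := 2 * 2 * numCtx * numLayers * numEmbed * numHeadKv / numHead

	// overhead estimate used above: num_gqa * kv / 6
	numGQA := numHead / numHeadKv
	graph := numGQA * kv / 6

	fmt.Println("kv bytes:", kv)       // 1073741824 (~1 GiB)
	fmt.Println("graph bytes:", graph) // 178956970 (~171 MiB)
}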
 
 	info := gpu.GetGPUInfo()
 	library := info.Library
+	switch runtime.GOOS {
+	case "darwin":
+		if opts.NumGPU == 0 {
+			break
+		}
 
-	if opts.NumGPU == -1 {
-		// default to offloading all layers
-		opts.NumGPU = int(ggml.NumLayers()) + 1
-	}
+		if size+kv+graph > vram {
+			log.Println("not enough vram available, falling back to CPU only")
+			opts.NumGPU = 0
+			break
+		}
+
+		opts.NumGPU = 1
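The darwin branch is deliberately all-or-nothing: Metal runs against unified memory, so either the entire working set (weights, kv cache and compute graph) fits and offload is enabled, or it falls back to CPU. A standalone sketch of that predicate (the helper name is invented for illustration and is not part of this package):

// wholeModelFits mirrors the darwin check above: everything must fit at once.
func wholeModelFits(size, kv, graph, vram int64) bool {
	return size+kv+graph <= vram
}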
+	default:
+		if library == "cpu" || library == "default" {
+			log.Println("GPU not available, falling back to CPU")
+			opts.NumGPU = 0
+			break
+		}
+
+		// don't use GPU at all if no layers are loaded
+		if opts.NumGPU == 0 {
+			library = "cpu"
+			break
+		}
+
+		// user-defined GPU count
+		if opts.NumGPU != -1 {
+			break
+		}
+
+		// the "main" GPU needs the most memory and determines the limit
+		// of how many layers can be loaded. It needs to fit:
+		// 1. the full compute graph allocation for all devices (graph)
+		// 2. the proportional kv cache for all devices (kv * % layers)
+		// 3. the proportional model (size * % layers / # devices)
+		// This estimates the number of layers
+		maxlayers := int64(ggml.NumLayers()) + 1
+		devices := int64(info.DeviceCount)
+		avg := vram / devices
+		layers := maxlayers * (avg - graph) / (kv + size/devices)
+		if layers > maxlayers {
+			layers = maxlayers
+		}
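The layers expression is simply the main-GPU budget solved for the layer count: the device has to hold graph plus the fraction layers/maxlayers of both the kv cache and its share of the model, i.e. graph + kv*layers/maxlayers + (size/devices)*layers/maxlayers <= avg. Rearranging gives layers <= maxlayers*(avg - graph)/(kv + size/devices), and the integer division rounds that bound down.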
 
-	// decide how many layers to put on the GPU
-	if opts.NumGPU > 0 {
-		switch runtime.GOOS {
-		case "darwin":
-			if requiredTotal > available {
-				log.Println("not enough vram available, falling back to CPU only")
-				opts.NumGPU = 0
-			}
-		default:
-			if library == "cpu" || library == "default" {
-				opts.NumGPU = 0
-				break
-			}
-
-			// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
-			// TODO: find the largest GPU and only reserve memory there
-			avgAvailable := available / int64(info.DeviceCount)
-			if requiredAlloc > avgAvailable {
-				log.Printf("not enough vram available, falling back to CPU only")
-				library = "cpu"
-				opts.NumGPU = 0
-				break
-			}
-
-			// we don't know which GPU will be used, so estimate
-			// the scratch buffer space on all of them
-			// TODO: allocate less layers to the GPU with the scratch buffer
-			// and more to the others (based on their available memory)
-			available -= requiredAlloc * int64(info.DeviceCount)
-
-			// no offloading required
-			if requiredModel+requiredKv <= available {
-				break
-			}
-
-			// fill remaining vram with layers
-			log.Println("splitting", available, "of available memory bytes into layers")
-			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
-			log.Println("bytes per layer:", bytesPerLayer)
-			layers := available / bytesPerLayer
-			log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
-			if layers < int64(opts.NumGPU) {
-				opts.NumGPU = int(layers)
-			}
+		// 1 + 2 must fit on the main gpu
+		min := graph + kv*layers/maxlayers
+		if layers <= 0 || min > avg {
+			log.Printf("not enough vram available, falling back to CPU only")
+			library = "cpu"
+			opts.NumGPU = 0
+			break
 		}
+
+		opts.NumGPU = int(layers)
 	}
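For a concrete feel of the new path, here is a self-contained re-run of the estimate with made-up numbers (a 13B-class model on a single 6 GiB GPU; none of these values come from this change):

package main

import "fmt"

func main() {
	// all values below are assumptions for illustration
	var (
		size    int64 = 7_400_000_000 // model file size in bytes
		kv      int64 = 1_677_721_600 // fp16 kv cache at 2048 ctx, 40 layers, 5120 embd
		graph   int64 = kv / 6        // num_gqa == 1 for this hypothetical model
		vram    int64 = 6 << 30       // total VRAM across devices
		devices int64 = 1
	)

	maxlayers := int64(40) + 1 // NumLayers() + 1
	avg := vram / devices
	layers := maxlayers * (avg - graph) / (kv + size/devices)
	if layers > maxlayers {
		layers = maxlayers
	}

	// the graph and this share of the kv cache must fit on the main GPU
	min := graph + kv*layers/maxlayers
	fmt.Println("layers:", layers, "fits:", layers > 0 && min <= avg)
	// expected with these assumptions: layers: 27 fits: true
}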
 
-	opts.NumGQA = 0
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
 	return newLlmServer(library, model, adapters, projectors, opts)