|
@@ -214,15 +214,36 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
|
|
params = append(params, "--threads", strconv.Itoa(defaultThreads))
|
|
|
}
|
|
|
|
|
|
- flashAttnEnabled := envconfig.FlashAttention()
|
|
|
+ fa := envconfig.FlashAttention()
|
|
|
+ if fa && !gpus.FlashAttentionSupported() {
|
|
|
+ slog.Warn("flash attention enabled but not supported by gpu")
|
|
|
+ fa = false
|
|
|
+ }
|
|
|
|
|
|
- for _, g := range gpus {
|
|
|
- // only cuda (compute capability 7+) and metal support flash attention
|
|
|
- if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
|
|
|
- flashAttnEnabled = false
|
|
|
+ if fa && !ggml.SupportsFlashAttention() {
|
|
|
+ slog.Warn("flash attention enabled but not supported by model")
|
|
|
+ fa = false
|
|
|
+ }
|
|
|
+
|
|
|
+ kvct := envconfig.KvCacheType()
|
|
|
+
|
|
|
+ if fa {
|
|
|
+ slog.Info("enabling flash attention")
|
|
|
+ params = append(params, "--flash-attn")
|
|
|
+
|
|
|
+ // Flash Attention also supports kv cache quantization
|
|
|
+ // Enable it if requested and the kv cache type is supported by the model
|
|
|
+ if kvct != "" && ggml.SupportsKVCacheType(kvct) {
|
|
|
+ params = append(params, "--kv-cache-type", kvct)
|
|
|
+ } else {
|
|
|
+ slog.Warn("kv cache type not supported by model", "type", kvct)
|
|
|
}
|
|
|
+ } else if kvct != "" && kvct != "f16" {
|
|
|
+ slog.Warn("quantized kv cache requested but flash attention disabled", "type", kvct)
|
|
|
+ }
|
|
|
|
|
|
- // mmap has issues with partial offloading on metal
|
|
|
+ // mmap has issues with partial offloading on metal
|
|
|
+ for _, g := range gpus {
|
|
|
if g.Library == "metal" &&
|
|
|
uint64(opts.NumGPU) > 0 &&
|
|
|
uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
|
|
@@ -231,10 +252,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- if flashAttnEnabled {
|
|
|
- params = append(params, "--flash-attn")
|
|
|
- }
|
|
|
-
|
|
|
// Windows CUDA should not use mmap for best performance
|
|
|
// Linux with a model larger than free space, mmap leads to thrashing
|
|
|
// For CPU loads we want the memory to be allocated, not FS cache
|