@@ -129,7 +129,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
var kvct string
if fa {
- requested := envconfig.KvCacheType()
+ requested := strings.ToLower(envconfig.KvCacheType())
if requested != "" && ggml.SupportsKVCacheType(requested) {
kvct = requested
}
@@ -225,7 +225,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
fa = false
- kvct := envconfig.KvCacheType()
+ kvct := strings.ToLower(envconfig.KvCacheType())
slog.Info("enabling flash attention")