5 月之前 · 539be43640
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -129,7 +129,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string,
 
 	var kvct string
 	if fa {
-		requested := envconfig.KvCacheType()
+		requested := strings.ToLower(envconfig.KvCacheType())
 		if requested != "" && ggml.SupportsKVCacheType(requested) {
 			kvct = requested
 		}
			
--- a/llm/server.go
+++ b/llm/server.go
@@ -225,7 +225,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		fa = false
 	}
 
-	kvct := envconfig.KvCacheType()
+	kvct := strings.ToLower(envconfig.KvCacheType())
 
 	if fa {
 		slog.Info("enabling flash attention")