
runner.go: Remove unused arguments

Now that server.cpp is gone, we no longer need to pass arguments that
were ignored and kept only for compatibility.
Jesse Gross, 6 months ago
commit a909417602
5 changed files with 1 addition and 15 deletions
  1. api/types.go (+1, -2)
  2. docs/api.md (+0, -1)
  3. llama/runner/runner.go (+0, -6)
  4. llm/server.go (+0, -5)
  5. parser/parser_test.go (+0, -1)

api/types.go (+1, -2)

@@ -236,7 +236,7 @@ type Runner struct {
 	NumGPU    int   `json:"num_gpu,omitempty"`
 	MainGPU   int   `json:"main_gpu,omitempty"`
 	LowVRAM   bool  `json:"low_vram,omitempty"`
-	F16KV     bool  `json:"f16_kv,omitempty"`
+	F16KV     bool  `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
 	LogitsAll bool  `json:"logits_all,omitempty"`
 	VocabOnly bool  `json:"vocab_only,omitempty"`
 	UseMMap   *bool `json:"use_mmap,omitempty"`
@@ -613,7 +613,6 @@ func DefaultOptions() Options {
 			NumGPU:    -1, // -1 here indicates that NumGPU should be set dynamically
 			NumThread: 0,  // let the runtime decide
 			LowVRAM:   false,
-			F16KV:     true,
 			UseMLock:  false,
 			UseMMap:   nil,
 		},
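Because the F16KV field keeps its JSON tag, older clients that still send f16_kv continue to decode cleanly; the value is simply never acted on anymore. Below is a minimal sketch of that behaviour, using a trimmed-down stand-in for api.Runner (not the real struct) and a hypothetical payload:

package main

import (
	"encoding/json"
	"fmt"
)

// Runner mirrors the relevant fields of api.Runner after this change:
// F16KV is still present for wire compatibility but is ignored.
type Runner struct {
	NumGPU int  `json:"num_gpu,omitempty"`
	F16KV  bool `json:"f16_kv,omitempty"` // Deprecated: This option is ignored
}

func main() {
	// An older client may still send f16_kv; decoding succeeds and the
	// field is simply never used when building the runner command line.
	payload := []byte(`{"num_gpu": 1, "f16_kv": true}`)

	var r Runner
	if err := json.Unmarshal(payload, &r); err != nil {
		panic(err)
	}
	fmt.Printf("num_gpu=%d f16_kv=%v (ignored)\n", r.NumGPU, r.F16KV)
}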

docs/api.md (+0, -1)

@@ -355,7 +355,6 @@ curl http://localhost:11434/api/generate -d '{
     "num_gpu": 1,
     "main_gpu": 0,
     "low_vram": false,
-    "f16_kv": true,
     "vocab_only": false,
     "use_mmap": true,
     "use_mlock": false,

llama/runner/runner.go (+0, -6)

@@ -837,14 +837,8 @@ func main() {
 	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
 	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	// Expose requirements as a JSON output to stdout
 	requirements := flag.Bool("requirements", false, "print json requirement information")
 
-	// These are either ignored by llama.cpp or have no significance to us
-	_ = flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
-	_ = flag.Bool("log-disable", false, "disables logging to a file")
-	_ = flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
-
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
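
Dropping the placeholder flag registrations only works because llm/server.go (below) stops passing those flags in the same change: Go's standard flag package, with its default ExitOnError handling, aborts on any flag that was never registered. A minimal sketch of that behaviour, assuming a standalone program rather than the actual runner main:

package main

import (
	"flag"
	"fmt"
)

func main() {
	// Only --requirements is registered, mirroring the runner after this change;
	// --embedding, --log-disable, and --memory-f32 are no longer defined.
	requirements := flag.Bool("requirements", false, "print json requirement information")

	// With the default flag.ExitOnError behaviour, passing an unregistered flag
	// such as --embedding makes Parse print "flag provided but not defined" and
	// exit with status 2, which is why the server must stop sending these flags
	// in the same commit that removes the registrations.
	flag.Parse()

	fmt.Println("requirements:", *requirements)
}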

llm/server.go (+0, -5)

@@ -186,7 +186,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		"--model", model,
 		"--ctx-size", strconv.Itoa(opts.NumCtx),
 		"--batch-size", strconv.Itoa(opts.NumBatch),
-		"--embedding",
 	}
 
 	if opts.NumGPU >= 0 {
@@ -218,10 +217,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		params = append(params, "--threads", strconv.Itoa(defaultThreads))
 	}
 
-	if !opts.F16KV {
-		params = append(params, "--memory-f32")
-	}
-
 	flashAttnEnabled := envconfig.FlashAttention()
 
 	for _, g := range gpus {

parser/parser_test.go (+0, -1)

@@ -440,7 +440,6 @@ func TestParseFileParameters(t *testing.T) {
 		"num_gpu 1":                    {"num_gpu", "1"},
 		"main_gpu 1":                   {"main_gpu", "1"},
 		"low_vram true":                {"low_vram", "true"},
-		"f16_kv true":                  {"f16_kv", "true"},
 		"logits_all true":              {"logits_all", "true"},
 		"vocab_only true":              {"vocab_only", "true"},
 		"use_mmap true":                {"use_mmap", "true"},