Browse Source

Fix CPU performance on hyperthreaded systems

The default thread count logic was broken and spawned twice as many
threads as intended on a hyperthreaded CPU,
causing thrashing and poor performance.
Daniel Hiltgen 1 year ago
parent
commit
325d74985b

+ 1 - 6
llm/ext_server.go

@@ -37,7 +37,6 @@ import (
 	"fmt"
 	"fmt"
 	"log"
 	"log"
 	"os"
 	"os"
-	"runtime"
 	"strings"
 	"strings"
 	"sync"
 	"sync"
 	"time"
 	"time"
@@ -185,11 +184,7 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
 		sparams.mmproj = nil
 		sparams.mmproj = nil
 	}
 	}
 
 
-	if opts.NumThread > 0 {
-		sparams.n_threads = C.uint(opts.NumThread)
-	} else {
-		sparams.n_threads = C.uint(runtime.NumCPU())
-	}
+	sparams.n_threads = C.uint(opts.NumThread)
 
 
 	log.Printf("Initializing internal llama server")
 	log.Printf("Initializing internal llama server")
 	resp := newExtServerResp(128)
 	resp := newExtServerResp(128)

+ 8 - 6
llm/llama.cpp/patches/0001-Expose-callable-API-for-server.patch

@@ -1,4 +1,4 @@
-From b5e195803e2a989e57eef0010adce778df1e2d01 Mon Sep 17 00:00:00 2001
+From 7184ae16e8fd0e9e91cac4c81daa323057fa992b Mon Sep 17 00:00:00 2001
 From: Daniel Hiltgen <daniel@ollama.com>
 From: Daniel Hiltgen <daniel@ollama.com>
 Date: Mon, 13 Nov 2023 12:25:58 -0800
 Date: Mon, 13 Nov 2023 12:25:58 -0800
 Subject: [PATCH] Expose callable API for server
 Subject: [PATCH] Expose callable API for server
@@ -6,10 +6,10 @@ Subject: [PATCH] Expose callable API for server
 This adds an extern "C" interface within the example server
 This adds an extern "C" interface within the example server
 ---
 ---
  examples/server/CMakeLists.txt |  24 +++
  examples/server/CMakeLists.txt |  24 +++
- examples/server/server.cpp     | 274 +++++++++++++++++++++++++++++++++
+ examples/server/server.cpp     | 276 +++++++++++++++++++++++++++++++++
  examples/server/server.h       |  89 +++++++++++
  examples/server/server.h       |  89 +++++++++++
  ggml-cuda.cu                   |   1 +
  ggml-cuda.cu                   |   1 +
- 4 files changed, 388 insertions(+)
+ 4 files changed, 390 insertions(+)
  create mode 100644 examples/server/server.h
  create mode 100644 examples/server/server.h
 
 
 diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
 diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt
@@ -46,7 +46,7 @@ index 859cd12..4ea47a7 100644
 +endif()
 +endif()
 \ No newline at end of file
 \ No newline at end of file
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
 diff --git a/examples/server/server.cpp b/examples/server/server.cpp
-index 0403853..2084fd8 100644
+index 0403853..065420c 100644
 --- a/examples/server/server.cpp
 --- a/examples/server/server.cpp
 +++ b/examples/server/server.cpp
 +++ b/examples/server/server.cpp
 @@ -5,6 +5,9 @@
 @@ -5,6 +5,9 @@
@@ -67,7 +67,7 @@ index 0403853..2084fd8 100644
  int main(int argc, char **argv)
  int main(int argc, char **argv)
  {
  {
  #if SERVER_VERBOSE != 1
  #if SERVER_VERBOSE != 1
-@@ -3123,3 +3127,273 @@ int main(int argc, char **argv)
+@@ -3123,3 +3127,275 @@ int main(int argc, char **argv)
      llama_backend_free();
      llama_backend_free();
      return 0;
      return 0;
  }
  }
@@ -89,7 +89,9 @@ index 0403853..2084fd8 100644
 +        gpt_params params;
 +        gpt_params params;
 +        params.n_ctx = sparams->n_ctx;
 +        params.n_ctx = sparams->n_ctx;
 +        params.n_batch = sparams->n_batch;
 +        params.n_batch = sparams->n_batch;
-+        params.n_threads = sparams->n_threads;
++        if (sparams->n_threads > 0) {
++            params.n_threads = sparams->n_threads;
++        }
 +        params.n_parallel = sparams->n_parallel;
 +        params.n_parallel = sparams->n_parallel;
 +        params.rope_freq_base = sparams->rope_freq_base;
 +        params.rope_freq_base = sparams->rope_freq_base;
 +        params.rope_freq_scale = sparams->rope_freq_scale;
 +        params.rope_freq_scale = sparams->rope_freq_scale;