1 year ago · c942e4a07b
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -2726,7 +2726,7 @@ static json format_detokenized_response(std::string content)
 
				 static void log_server_request(const httplib::Request &req, const httplib::Response &res)
			
 
				 {
			
 
				     // skip GH copilot requests when using default port
			
 
				-    if (req.path == "/v1/health" || req.path == "/v1/completions")
			
 
				+    if (req.path == "/health" || req.path == "/v1/health" || req.path == "/v1/completions")
			
 
				     {
			
 
				         return;
			
 
				     }
			
@@ -3053,6 +3053,26 @@ int main(int argc, char **argv) {
 
				         log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
			
 
				     }
			
 
				 
			
 
				+    if (sparams.n_threads_http < 1) {
			
 
				+        // +2 threads for monitoring endpoints
			
 
				+        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
			
 
				+    }
			
 
				+    log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
			
 
				+    svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
			
 
				+
			
 
				+    LOG_INFO("HTTP server listening", log_data);
			
 
				+    // run the HTTP server in a thread - see comment below
			
 
				+    std::thread t([&]()
			
 
				+            {
			
 
				+                if (!svr.listen_after_bind())
			
 
				+                {
			
 
				+                    state.store(SERVER_STATE_ERROR);
			
 
				+                    return 1;
			
 
				+                }
			
 
				+
			
 
				+                return 0;
			
 
				+            });
			
 
				+
			
 
				     // load the model
			
 
				     if (!llama.load_model(params))
			
 
				     {
			
@@ -3257,26 +3277,6 @@ int main(int argc, char **argv) {
 
				     }*/
			
 
				     //);
			
 
				 
			
 
				-    if (sparams.n_threads_http < 1) {
			
 
				-        // +2 threads for monitoring endpoints
			
 
				-        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
			
 
				-    }
			
 
				-    log_data["n_threads_http"] =  std::to_string(sparams.n_threads_http);
			
 
				-    svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
			
 
				-
			
 
				-    LOG_INFO("HTTP server listening", log_data);
			
 
				-    // run the HTTP server in a thread - see comment below
			
 
				-    std::thread t([&]()
			
 
				-            {
			
 
				-                if (!svr.listen_after_bind())
			
 
				-                {
			
 
				-                    state.store(SERVER_STATE_ERROR);
			
 
				-                    return 1;
			
 
				-                }
			
 
				-
			
 
				-                return 0;
			
 
				-            });
			
 
				-
			
 
				     llama.queue_tasks.on_new_task(std::bind(
			
 
				         &llama_server_context::process_single_task, &llama, std::placeholders::_1));
			
 
				     llama.queue_tasks.on_finish_multitask(std::bind(