
Fix embeddings memory corruption (#6467)

* Fix embeddings memory corruption

The patch was leading to a buffer overrun corruption. Once removed, though, parallelism
in server.cpp led to hitting an assert because slot/seq IDs could be >= the token count. To
work around this, only use slot 0 for embeddings.

* Fix embed integration test assumption

The token eval count has changed with recent llama.cpp bumps (0.3.5+)
Daniel Hiltgen, 8 months ago
commit 90ca84172c
4 files changed, 16 insertions(+), 65 deletions(-)
  1. integration/embed_test.go (+4 -4)
  2. llm/ext_server/server.cpp (+7 -1)
  3. llm/patches/08-pooling.diff (+0 -60)
  4. server/sched.go (+5 -0)

+ 4 - 4
integration/embed_test.go

@@ -70,8 +70,8 @@ func TestAllMiniLMEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031, got %.8f", res.Embeddings[0][0])
 	}
 
-	if res.PromptEvalCount != 8 {
-		t.Fatalf("expected 8 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 6 {
+		t.Fatalf("expected 6 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }
 
@@ -102,8 +102,8 @@ func TestAllMiniLMBatchEmbed(t *testing.T) {
 		t.Fatalf("expected 0.010071031 and -0.009802706, got %.8f and %.8f", res.Embeddings[0][0], res.Embeddings[1][0])
 	}
 
-	if res.PromptEvalCount != 16 {
-		t.Fatalf("expected 16 prompt tokens, got %d", res.PromptEvalCount)
+	if res.PromptEvalCount != 12 {
+		t.Fatalf("expected 12 prompt tokens, got %d", res.PromptEvalCount)
 	}
 }
 

+ 7 - 1
llm/ext_server/server.cpp

@@ -1429,7 +1429,13 @@ struct llama_server_context
         switch (task.type)
         {
             case TASK_TYPE_COMPLETION: {
-                server_slot *slot = prefix_slot(task.data["prompt"]);
+                server_slot *slot = nullptr;
+                if (task.embedding_mode) {
+                    // Embedding seq_id (aka slot id) must always be <= token length, so always use slot 0
+                    slot = slots[0].available() ? &slots[0] : nullptr;
+                } else {
+                    slot = prefix_slot(task.data["prompt"]);
+                }
                 if (slot == nullptr)
                 {
                     // if no slot is available, we defer this task for processing later
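
A note on why slot 0 is safe here (a sketch of the constraint, not the server code): as the commit message and the in-code comment say, the slot index doubles as the llama.cpp sequence id, and upstream sizes its pooling inputs by the batch's token count, so the assert fires whenever a slot id reaches the (often small) embedding prompt length. Slot 0 is always in range for a non-empty prompt. The names below are illustrative, not taken from server.cpp or llama.cpp internals:

    // Minimal, self-contained sketch of the seq_id < n_tokens constraint.
    #include <cassert>
    #include <vector>

    int main() {
        int n_tokens = 2;                                    // a short embedding prompt
        std::vector<float> pooling_scratch(n_tokens, 0.0f);  // upstream sizes pooling inputs by n_tokens

        int seq_id = 0;                                      // slot 0 doubles as the sequence id
        assert(seq_id < n_tokens);                           // holds for any non-empty prompt
        pooling_scratch[seq_id] = 1.0f;                      // in bounds; a slot id >= n_tokens would not be
        return 0;
    }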

+ 0 - 60
llm/patches/08-pooling.diff

@@ -1,60 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 721b8f4e..cfe7ac40 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -8420,14 +8420,14 @@ struct llm_build_context {
-     }
- 
-     struct ggml_tensor * build_inp_mean() {
--        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
-+        lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, cparams.n_seq_max);
-         cb(lctx.inp_mean, "inp_mean", -1);
-         ggml_set_input(lctx.inp_mean);
-         return lctx.inp_mean;
-     }
- 
-     struct ggml_tensor * build_inp_cls() {
--        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-+        lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, cparams.n_seq_max);
-         cb(lctx.inp_cls, "inp_cls", -1);
-         ggml_set_input(lctx.inp_cls);
-         return lctx.inp_cls;
-@@ -13847,19 +13847,16 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
-         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
- 
-         float * data = (float *) lctx.inp_mean->data;
--        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
-+        memset(lctx.inp_mean->data, 0, n_tokens * cparams.n_seq_max * ggml_element_size(lctx.inp_mean));
- 
-         std::vector<uint64_t> sum(n_tokens, 0);
-         for (int i = 0; i < n_tokens; ++i) {
-             const llama_seq_id seq_id = batch.seq_id[i][0];
--
--            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
--
-             sum[seq_id] += 1;
-         }
- 
--        std::vector<float> div(n_tokens, 0.0f);
--        for (int i = 0; i < n_tokens; ++i) {
-+        std::vector<float> div(cparams.n_seq_max, 0.0f);
-+        for (uint32_t i = 0; i < cparams.n_seq_max; ++i) {
-             const uint64_t s = sum[i];
-             if (s > 0) {
-                 div[i] = 1.0f/float(s);
-@@ -13879,14 +13876,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
-         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
- 
-         uint32_t * data = (uint32_t *) lctx.inp_cls->data;
--        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
-+        memset(lctx.inp_cls->data, 0, cparams.n_seq_max * ggml_element_size(lctx.inp_cls));
- 
-         for (int i = 0; i < n_tokens; ++i) {
-             const llama_seq_id seq_id = batch.seq_id[i][0];
-             const llama_pos    pos    = batch.pos[i];
--
--            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
--
-             if (pos == 0) {
-                 data[seq_id] = i;
-             }
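
Judging from the hunk above, one likely source of the corruption this patch is being dropped for: it widened inp_mean/inp_cls and the div loop to cparams.n_seq_max and deleted the seq_id < n_tokens asserts, but left the sum vector sized by n_tokens, so whenever n_seq_max exceeded the batch's token count the loop read (and a large seq_id wrote) past the end of sum. A self-contained sketch of that mismatch, with the missing bounds check spelled out; variable names mirror the patch but nothing else is taken from llama.cpp:

    // Sketch only: models the sizing mismatch in the removed patch. The real
    // code has no `i < sum.size()` guard, which is where the out-of-bounds
    // access would occur.
    #include <cstdint>
    #include <vector>

    int main() {
        const uint32_t n_tokens  = 2;   // tokens in the embedding batch
        const uint32_t n_seq_max = 4;   // parallel slots configured on the server

        std::vector<uint64_t> sum(n_tokens, 0);      // left at n_tokens by the patch
        std::vector<float>    div(n_seq_max, 0.0f);  // widened to n_seq_max by the patch

        for (uint32_t i = 0; i < n_seq_max; ++i) {
            if (i < sum.size() && sum[i] > 0) {      // guard absent upstream
                div[i] = 1.0f / float(sum[i]);
            }
        }
        return 0;
    }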

+ 5 - 0
server/sched.go

@@ -193,6 +193,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 
+					// Embedding models should always be loaded with parallel=1
+					if pending.model.CheckCapabilities(CapabilityCompletion) != nil {
+						numParallel = 1
+					}
+
 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 					if len(gpus) == 1 && gpus[0].Library == "cpu" {
 						// simplifying assumption of defaultParallel when in CPU mode
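
The scheduler-side guard complements the server.cpp change above: a model that fails pending.model.CheckCapabilities(CapabilityCompletion) is treated as embedding-only and loaded with numParallel = 1, so the runner only ever allocates a single slot (slot 0) and the slot-id/sequence-id constraint cannot be exceeded in the first place.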