
Fix llava models not working after first request (#4164)

* fix llava models not working after first request

* individual requests only for llava models
Jeffrey Morgan, 1 year ago
commit 1b0e6c9c0e
2 files changed, 32 additions and 1 deletion
  1. llm/patches/05-clip-fix.diff (+24, -0)
  2. llm/server.go (+8, -1)

llm/patches/05-clip-fix.diff (+24, -0)

@@ -0,0 +1,24 @@
+diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
+index e3c9bcd4..b43f892d 100644
+--- a/examples/llava/clip.cpp
++++ b/examples/llava/clip.cpp
+@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
+     struct ggml_tensor * embeddings = inp;
+     if (ctx->has_class_embedding) {
+         embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
++    }
++    ggml_set_name(embeddings, "embeddings");
++    ggml_set_input(embeddings);
++
++    if (ctx->has_class_embedding) {
+         embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+         embeddings = ggml_acc(ctx0, embeddings, inp,
+                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+     }
+-    ggml_set_name(embeddings, "embeddings");
+-    ggml_set_input(embeddings);
+-
+ 
+     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+     ggml_set_name(positions, "positions");

llm/server.go (+8, -1)

@@ -194,8 +194,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}
 
-	// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
 	numParallel := envconfig.NumParallel
+
+	// TODO (jmorganca): multimodal models don't support parallel yet
+	// see https://github.com/ollama/ollama/issues/4165
+	if len(projectors) > 0 {
+		numParallel = 1
+		slog.Warn("multimodal models don't support parallel requests yet")
+	}
+
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
 	for i := 0; i < len(servers); i++ {
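
The server.go half of the change is small enough to read in isolation: when any projector files (the clip/llava weights) are passed in, the configured parallelism is overridden to a single slot before `--parallel` is handed to the runner. Below is a minimal, self-contained Go sketch of that guard; `pickParallel` and the default value of 4 are illustrative stand-ins for the real `NewLlamaServer` code and `envconfig.NumParallel`, not part of the commit itself.

package main

import (
	"fmt"
	"log/slog"
)

// pickParallel mirrors the guard added in this commit: multimodal (llava)
// models are limited to a single slot because parallel decoding is not
// supported for them yet. numParallelDefault stands in for envconfig.NumParallel.
func pickParallel(projectors []string, numParallelDefault int) int {
	numParallel := numParallelDefault
	if len(projectors) > 0 {
		numParallel = 1
		slog.Warn("multimodal models don't support parallel requests yet")
	}
	return numParallel
}

func main() {
	// Text-only model: keeps the configured parallelism.
	fmt.Println(pickParallel(nil, 4)) // 4

	// llava model with a clip projector: forced to one request at a time.
	fmt.Println(pickParallel([]string{"mmproj-model-f16.gguf"}, 4)) // 1
}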