
Fix llava models not working after first request (#4164)

* fix llava models not working after first request

* individual requests only for llava models
Jeffrey Morgan, 1 year ago
commit 1b0e6c9c0e
2 files changed, 32 additions and 1 deletion
  1. llm/patches/05-clip-fix.diff (+24, -0)
  2. llm/server.go (+8, -1)

llm/patches/05-clip-fix.diff (+24, -0)

@@ -0,0 +1,24 @@
+diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
+index e3c9bcd4..b43f892d 100644
+--- a/examples/llava/clip.cpp
++++ b/examples/llava/clip.cpp
+@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
+     struct ggml_tensor * embeddings = inp;
+     if (ctx->has_class_embedding) {
+         embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
++    }
++    ggml_set_name(embeddings, "embeddings");
++    ggml_set_input(embeddings);
++
++    if (ctx->has_class_embedding) {
+         embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+         embeddings = ggml_acc(ctx0, embeddings, inp,
+                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+     }
+-    ggml_set_name(embeddings, "embeddings");
+-    ggml_set_input(embeddings);
+-
+ 
+     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
+     ggml_set_name(positions, "positions");

llm/server.go (+8, -1)

@@ -194,8 +194,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}
 
-	// "--cont-batching", // TODO - doesn't seem to have any noticeable perf change for multiple requests
 	numParallel := envconfig.NumParallel
+
+	// TODO (jmorganca): multimodal models don't support parallel yet
+	// see https://github.com/ollama/ollama/issues/4165
+	if len(projectors) > 0 {
+		numParallel = 1
+		slog.Warn("multimodal models don't support parallel requests yet")
+	}
+
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
 	for i := 0; i < len(servers); i++ {
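
The server.go half of the change is small enough to read in isolation: when any projector files (the clip/llava weights) are passed in, the configured parallelism is overridden to a single slot before `--parallel` is handed to the runner. Below is a minimal, self-contained Go sketch of that guard; `pickParallel` and the default value of 4 are illustrative stand-ins for the real `NewLlamaServer` code and `envconfig.NumParallel`, not part of the commit itself.

package main

import (
	"fmt"
	"log/slog"
)

// pickParallel mirrors the guard added in this commit: multimodal (llava)
// models are limited to a single slot because parallel decoding is not
// supported for them yet. numParallelDefault stands in for envconfig.NumParallel.
func pickParallel(projectors []string, numParallelDefault int) int {
	numParallel := numParallelDefault
	if len(projectors) > 0 {
		numParallel = 1
		slog.Warn("multimodal models don't support parallel requests yet")
	}
	return numParallel
}

func main() {
	// Text-only model: keeps the configured parallelism.
	fmt.Println(pickParallel(nil, 4)) // 4

	// llava model with a clip projector: forced to one request at a time.
	fmt.Println(pickParallel([]string{"mmproj-model-f16.gguf"}, 4)) // 1
}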