
llama: sync llama.cpp to commit 8962422

jmorganca 8 months ago
parent
commit f443dd7b81
100 changed files with 1738 additions and 353 deletions
  1. llama/build-info.cpp (+1 -1)
  2. llama/clip.cpp (+619 -82)
  3. llama/clip.h (+12 -3)
  4. llama/common.cpp (+446 -40)
  5. llama/common.h (+27 -13)
  6. llama/ggml-aarch64.c (+9 -22)
  7. llama/ggml-aarch64.h (+1 -1)
  8. llama/ggml-alloc.c (+1 -1)
  9. llama/ggml-alloc.h (+3 -3)
  10. llama/ggml-backend-impl.h (+1 -1)
  11. llama/ggml-backend.c (+41 -24)
  12. llama/ggml-backend.h (+3 -1)
  13. llama/ggml-common.h (+1 -1)
  14. llama/ggml-cuda.cu (+43 -21)
  15. llama/ggml-cuda.h (+1 -1)
  16. llama/ggml-cuda/acc.cu (+1 -1)
  17. llama/ggml-cuda/acc.cuh (+1 -1)
  18. llama/ggml-cuda/arange.cu (+1 -1)
  19. llama/ggml-cuda/arange.cuh (+1 -1)
  20. llama/ggml-cuda/argsort.cu (+1 -1)
  21. llama/ggml-cuda/argsort.cuh (+1 -1)
  22. llama/ggml-cuda/binbcast.cu (+9 -1)
  23. llama/ggml-cuda/binbcast.cuh (+2 -1)
  24. llama/ggml-cuda/clamp.cu (+1 -1)
  25. llama/ggml-cuda/clamp.cuh (+1 -1)
  26. llama/ggml-cuda/common.cuh (+1 -1)
  27. llama/ggml-cuda/concat.cu (+1 -1)
  28. llama/ggml-cuda/concat.cuh (+1 -1)
  29. llama/ggml-cuda/conv-transpose-1d.cu (+1 -1)
  30. llama/ggml-cuda/conv-transpose-1d.cuh (+1 -1)
  31. llama/ggml-cuda/convert.cu (+1 -1)
  32. llama/ggml-cuda/convert.cuh (+1 -1)
  33. llama/ggml-cuda/cpy.cu (+1 -1)
  34. llama/ggml-cuda/cpy.cuh (+1 -1)
  35. llama/ggml-cuda/cross-entropy-loss.cu (+132 -0)
  36. llama/ggml-cuda/cross-entropy-loss.cuh (+31 -0)
  37. llama/ggml-cuda/dequantize.cuh (+1 -1)
  38. llama/ggml-cuda/diagmask.cu (+1 -1)
  39. llama/ggml-cuda/diagmask.cuh (+1 -1)
  40. llama/ggml-cuda/dmmv.cu (+1 -1)
  41. llama/ggml-cuda/dmmv.cuh (+1 -1)
  42. llama/ggml-cuda/fattn-common.cuh (+13 -6)
  43. llama/ggml-cuda/fattn-tile-f16.cu (+44 -10)
  44. llama/ggml-cuda/fattn-tile-f16.cuh (+1 -1)
  45. llama/ggml-cuda/fattn-tile-f32.cu (+41 -8)
  46. llama/ggml-cuda/fattn-tile-f32.cuh (+1 -1)
  47. llama/ggml-cuda/fattn-vec-f16.cuh (+59 -14)
  48. llama/ggml-cuda/fattn-vec-f32.cuh (+58 -12)
  49. llama/ggml-cuda/fattn-wmma-f16.cuh (+59 -6)
  50. llama/ggml-cuda/fattn.cu (+3 -3)
  51. llama/ggml-cuda/fattn.cuh (+1 -1)
  52. llama/ggml-cuda/getrows.cu (+1 -1)
  53. llama/ggml-cuda/getrows.cuh (+1 -1)
  54. llama/ggml-cuda/im2col.cu (+1 -1)
  55. llama/ggml-cuda/im2col.cuh (+1 -1)
  56. llama/ggml-cuda/mma.cuh (+1 -1)
  57. llama/ggml-cuda/mmq.cu (+1 -1)
  58. llama/ggml-cuda/mmq.cuh (+1 -1)
  59. llama/ggml-cuda/mmvq.cu (+1 -1)
  60. llama/ggml-cuda/mmvq.cuh (+1 -1)
  61. llama/ggml-cuda/norm.cu (+1 -1)
  62. llama/ggml-cuda/norm.cuh (+1 -1)
  63. llama/ggml-cuda/pad.cu (+1 -1)
  64. llama/ggml-cuda/pad.cuh (+1 -1)
  65. llama/ggml-cuda/pool2d.cu (+1 -1)
  66. llama/ggml-cuda/pool2d.cuh (+1 -1)
  67. llama/ggml-cuda/quantize.cu (+1 -1)
  68. llama/ggml-cuda/quantize.cuh (+1 -1)
  69. llama/ggml-cuda/rope.cu (+2 -2)
  70. llama/ggml-cuda/rope.cuh (+1 -1)
  71. llama/ggml-cuda/scale.cu (+1 -1)
  72. llama/ggml-cuda/scale.cuh (+1 -1)
  73. llama/ggml-cuda/softmax.cu (+1 -1)
  74. llama/ggml-cuda/softmax.cuh (+1 -1)
  75. llama/ggml-cuda/sumrows.cu (+2 -3)
  76. llama/ggml-cuda/sumrows.cuh (+3 -1)
  77. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu (+1 -1)
  78. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu (+1 -1)
  79. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu (+1 -1)
  80. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu (+1 -1)
  81. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu (+1 -1)
  82. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu (+1 -1)
  83. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu (+1 -1)
  84. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu (+1 -1)
  85. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu (+1 -1)
  86. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu (+1 -1)
  87. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu (+1 -1)
  88. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu (+1 -1)
  89. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu (+1 -1)
  90. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu (+1 -1)
  91. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu (+1 -1)
  92. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu (+1 -1)
  93. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu (+1 -1)
  94. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu (+1 -1)
  95. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu (+1 -1)
  96. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu (+1 -1)
  97. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu (+1 -1)
  98. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu (+1 -1)
  99. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu (+1 -1)
  100. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu (+1 -1)

+ 1 - 1
llama/build-info.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 619 - 82
llama/clip.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -46,6 +46,10 @@
 #include "ggml-cann.h"
 #endif
 
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
 
@@ -100,26 +104,28 @@ static std::string format(const char * fmt, ...) {
 // key constants
 //
 
-#define KEY_FTYPE          "general.file_type"
-#define KEY_NAME           "general.name"
-#define KEY_DESCRIPTION    "general.description"
-#define KEY_HAS_TEXT_ENC   "clip.has_text_encoder"
-#define KEY_HAS_VIS_ENC    "clip.has_vision_encoder"
-#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
-#define KEY_USE_GELU       "clip.use_gelu"
-#define KEY_N_EMBD         "clip.%s.embedding_length"
-#define KEY_N_FF           "clip.%s.feed_forward_length"
-#define KEY_N_BLOCK        "clip.%s.block_count"
-#define KEY_N_HEAD         "clip.%s.attention.head_count"
-#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
-#define KEY_PROJ_DIM       "clip.%s.projection_dim"
-#define KEY_TOKENS         "tokenizer.ggml.tokens"
-#define KEY_N_POSITIONS    "clip.text.context_length"
-#define KEY_IMAGE_SIZE     "clip.vision.image_size"
-#define KEY_PATCH_SIZE     "clip.vision.patch_size"
-#define KEY_IMAGE_MEAN     "clip.vision.image_mean"
-#define KEY_IMAGE_STD      "clip.vision.image_std"
-#define KEY_PROJ_TYPE      "clip.projector_type"
+#define KEY_FTYPE               "general.file_type"
+#define KEY_NAME                "general.name"
+#define KEY_DESCRIPTION         "general.description"
+#define KEY_HAS_TEXT_ENC        "clip.has_text_encoder"
+#define KEY_HAS_VIS_ENC         "clip.has_vision_encoder"
+#define KEY_HAS_LLAVA_PROJ      "clip.has_llava_projector"
+#define KEY_HAS_MINICPMV_PROJ   "clip.has_minicpmv_projector"
+#define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
+#define KEY_USE_GELU            "clip.use_gelu"
+#define KEY_N_EMBD              "clip.%s.embedding_length"
+#define KEY_N_FF                "clip.%s.feed_forward_length"
+#define KEY_N_BLOCK             "clip.%s.block_count"
+#define KEY_N_HEAD              "clip.%s.attention.head_count"
+#define KEY_LAYER_NORM_EPS      "clip.%s.attention.layer_norm_epsilon"
+#define KEY_PROJ_DIM            "clip.%s.projection_dim"
+#define KEY_TOKENS              "tokenizer.ggml.tokens"
+#define KEY_N_POSITIONS         "clip.text.context_length"
+#define KEY_IMAGE_SIZE          "clip.vision.image_size"
+#define KEY_PATCH_SIZE          "clip.vision.patch_size"
+#define KEY_IMAGE_MEAN          "clip.vision.image_mean"
+#define KEY_IMAGE_STD           "clip.vision.image_std"
+#define KEY_PROJ_TYPE           "clip.projector_type"
 
 #define KEY_MM_PATCH_MERGE_TYPE   "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS  "clip.vision.image_grid_pinpoints"
@@ -153,12 +159,20 @@ static std::string format(const char * fmt, ...) {
 #define TN_MVLM_PROJ_PEG   "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE   "model.image_newline"
 
+#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
+#define TN_MINICPMV_QUERY "resampler.query"
+#define TN_MINICPMV_PROJ "resampler.proj.weight"
+#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
+#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
+#define TN_MINICPMV_LN "resampler.ln_%s.%s"
+
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
+    PROJECTOR_TYPE_RESAMPLER,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -166,6 +180,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
     { PROJECTOR_TYPE_LDPV2, "ldpv2"},
+    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
 };
 
 
@@ -226,17 +241,20 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
 }
 
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
     }
-    s = std::move(result);
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
 }
 
 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@@ -518,12 +536,34 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_mlp_2_b;
     struct ggml_tensor * mm_model_peg_0_w;
     struct ggml_tensor * mm_model_peg_0_b;
+
+    // MINICPMV projection
+    struct ggml_tensor * mm_model_pos_embed_k;
+    struct ggml_tensor * mm_model_query;
+    struct ggml_tensor * mm_model_proj;
+    struct ggml_tensor * mm_model_kv_proj;
+    struct ggml_tensor * mm_model_attn_q_w;
+    struct ggml_tensor * mm_model_attn_q_b;
+    struct ggml_tensor * mm_model_attn_k_w;
+    struct ggml_tensor * mm_model_attn_k_b;
+    struct ggml_tensor * mm_model_attn_v_w;
+    struct ggml_tensor * mm_model_attn_v_b;
+    struct ggml_tensor * mm_model_attn_o_w;
+    struct ggml_tensor * mm_model_attn_o_b;
+    struct ggml_tensor * mm_model_ln_q_w;
+    struct ggml_tensor * mm_model_ln_q_b;
+    struct ggml_tensor * mm_model_ln_kv_w;
+    struct ggml_tensor * mm_model_ln_kv_b;
+    struct ggml_tensor * mm_model_ln_post_w;
+    struct ggml_tensor * mm_model_ln_post_b;
 };
 
 struct clip_ctx {
     bool has_text_encoder    = false;
     bool has_vision_encoder  = false;
     bool has_llava_projector = false;
+    bool has_minicpmv_projector = false;
+    int minicpmv_version = 2;
 
     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -548,9 +588,11 @@ struct clip_ctx {
 
     ggml_backend_t backend       = NULL;
     ggml_gallocr_t compute_alloc = NULL;
+
+    struct clip_image_size * load_image_size;
 };
 
-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return nullptr;
@@ -559,20 +601,33 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
-    const int image_size           = hparams.image_size;
+    const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        if (load_image_size == nullptr) {
+            load_image_size = clip_image_size_init();
+        }
+        LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        image_size_width  = load_image_size->width;
+        image_size_height = load_image_size->height;
+        if (is_inf) {
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
+        }
+    }
     const int patch_size           = hparams.patch_size;
-    const int num_patches          = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
+    const int num_patches          = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
     const int hidden_size          = hparams.hidden_size;
     const int n_head               = hparams.n_head;
     const int d_head               = hidden_size / n_head;
-    const int n_layer              = hparams.n_layer;
+    int n_layer                    = hparams.n_layer;
     const float eps                = hparams.eps;
 
     const int batch_size = imgs->size;
 
-    if (ctx->has_llava_projector) {
+    if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
         GGML_ASSERT(batch_size == 1);
     }
 
@@ -585,7 +640,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
+    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
 
@@ -598,19 +653,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
         inp = ggml_add(ctx0, inp, model.patch_bias);
     }
-
-    // concat class_embeddings and patch_embeddings
     struct ggml_tensor * embeddings = inp;
-    if (ctx->has_class_embedding) {
-        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        ggml_set_name(embeddings, "embeddings");
-        ggml_set_input(embeddings);
-        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-        embeddings = ggml_acc(ctx0, embeddings, inp,
-                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-    }
+    struct ggml_tensor * pos_embed = nullptr;
 
+    if (ctx->has_llava_projector) {
+        // concat class_embeddings and patch_embeddings
+        if (ctx->has_class_embedding) {
+            embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+            ggml_set_name(embeddings, "embeddings");
+            ggml_set_input(embeddings);
+            embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+            embeddings = ggml_acc(ctx0, embeddings, inp,
+                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+        }
+    }
 
     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
@@ -619,6 +676,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     embeddings =
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
 
+    if (ctx->has_minicpmv_projector) {
+        int pos_w = image_size_width/patch_size;
+        int pos_h = image_size_height/patch_size;
+        if (ctx->minicpmv_version == 2) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 3) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
+        }
+        ggml_set_name(pos_embed, "pos_embed");
+        ggml_set_input(pos_embed);
+    }
+
     // pre-layernorm
     if (ctx->has_pre_norm) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
@@ -628,6 +698,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // loop over layers
+    if (ctx->has_minicpmv_projector) {
+        n_layer += 1;
+    }
     for (int il = 0; il < n_layer - 1; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
 
@@ -717,7 +790,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // llava projector
-    {
+    if (ctx->has_llava_projector) {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
 
         struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
@@ -738,8 +811,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = ggml_gelu(ctx0, embeddings);
             embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
-        } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        }
+        else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
             // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -898,6 +971,75 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             GGML_ABORT("fatal error");
         }
     }
+    // minicpmv projector
+    else if (ctx->has_minicpmv_projector)
+    {
+        if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+            struct ggml_tensor * q = model.mm_model_query;
+            { // layernorm
+                q = ggml_norm(ctx0, q, eps);
+                q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+            }
+            struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+            { // layernorm
+                v = ggml_norm(ctx0, v, eps);
+                v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+            }
+            struct ggml_tensor * k;
+            { // position
+                // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
+                k = ggml_add(ctx0, v, pos_embed);
+            }
+
+            { // attention
+                int hidden_size = 4096;
+                const int d_head = 128;
+                int n_head = hidden_size/d_head;
+                int num_query = 96;
+                if (ctx->minicpmv_version == 2) {
+                    hidden_size = 4096;
+                    n_head = hidden_size/d_head;
+                    num_query = 96;
+                }
+                else if (ctx->minicpmv_version == 3) {
+                    hidden_size = 3584;
+                    n_head = hidden_size/d_head;
+                    num_query = 64;
+                }
+
+                struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
+                Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+                struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
+                struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
+                // permute
+                Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
+                Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+                Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
+                K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+                K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+                K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+                V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+                V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+                V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+                KQ = ggml_soft_max_inplace(ctx0, KQ);
+                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+                KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
+                KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+                KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
+
+                embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
+            }
+            { // layernorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
+            }
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+        }
+        else {
+            GGML_ASSERT(false);
+        }
+    }
 
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
@@ -1002,7 +1144,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
     }
 
-    clip_ctx * new_clip = new clip_ctx;
+    clip_ctx * new_clip = new clip_ctx{};
 
     // update projector type
     {
@@ -1036,6 +1178,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     LOG_TEE("%s: CLIP using CANN backend\n", __func__);
 #endif
 
+#ifdef GGML_USE_VULKAN
+    new_clip->backend = ggml_backend_vk_init(0);
+    LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
+#endif
 
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
@@ -1055,7 +1201,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
         }
 
-        GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+        idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
+        if (idx != -1) {
+            new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
+        }
+
+        idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
+        if (idx != -1) {
+            new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
+        }
+
+        // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+
         GGML_ASSERT(new_clip->has_vision_encoder);
         GGML_ASSERT(!new_clip->has_text_encoder);
 
@@ -1066,6 +1223,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             LOG_TEE("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
             LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
             LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            LOG_TEE("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
             LOG_TEE("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
             LOG_TEE("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
         }
@@ -1307,6 +1465,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
             vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
         }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+            // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
+            vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
+            vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
+            vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
+            vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
+            vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
+            vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
+            vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
+            vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
+            vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
+            vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
+            vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
+            vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
+            vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
+            vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
+            vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
+            vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
+            vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
+            vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
+        }
         else {
             std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
             throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1345,7 +1524,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
-        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
+        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
         LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
@@ -1354,6 +1533,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }
 
+void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
+    ctx_clip->load_image_size = load_image_size;
+}
+
+struct clip_image_size * clip_image_size_init() {
+    struct clip_image_size * load_image_size = new struct clip_image_size();
+    load_image_size->width = 448;
+    load_image_size->height = 448;
+    return load_image_size;
+}
+
 struct clip_image_u8 * clip_image_u8_init() {
     return new clip_image_u8();
 }
@@ -1459,7 +1649,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
     }
 }
 
-inline float clip(float x, float lower, float upper) {
+inline int clip(int x, int lower, int upper) {
     return std::max(lower, std::min(x, upper));
 }
 
@@ -1624,9 +1814,182 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
     return patches;
 }
 
+static int ensure_divide(int length, int patch_size) {
+    return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
+}
+
+static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
+    int width = original_size.first;
+    int height = original_size.second;
+    if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
+        float r = static_cast<float>(width) / height;
+        height = static_cast<int>(scale_resolution / std::sqrt(r));
+        width = static_cast<int>(height * r);
+    }
+    int best_width = ensure_divide(width, patch_size);
+    int best_height = ensure_divide(height, patch_size);
+    return std::make_pair(best_width, best_height);
+}
+
+static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
+    int width, height;
+    std::tie(width, height) = original_size;
+    int grid_x, grid_y;
+    std::tie(grid_x, grid_y) = grid;
+
+    int refine_width = ensure_divide(width, grid_x);
+    int refine_height = ensure_divide(height, grid_y);
+
+    int grid_width = refine_width / grid_x;
+    int grid_height = refine_height / grid_y;
+
+   // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
+    auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
+    int best_grid_width, best_grid_height;
+    std::tie(best_grid_width, best_grid_height) = best_grid_size;
+
+  //  std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
+    std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
+    return refine_size;
+}
+
+static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
+    std::vector<int> candidate_split_grids_nums;
+    for (int i : {multiple - 1, multiple, multiple + 1}) {
+        if (i == 1 || i > max_slice_nums) {
+            continue;
+        }
+        candidate_split_grids_nums.push_back(i);
+    }
+
+    std::vector<std::pair<int, int>> candidate_grids;
+    for (int split_grids_nums : candidate_split_grids_nums) {
+        int m = 1;
+        while (m <= split_grids_nums) {
+            if (split_grids_nums % m == 0) {
+                candidate_grids.emplace_back(m, split_grids_nums / m);
+            }
+            ++m;
+        }
+    }
+
+    std::pair<int, int> best_grid{1, 1};
+    float min_error = std::numeric_limits<float>::infinity();
+    for (const auto& grid : candidate_grids) {
+        float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
+        if (error < min_error) {
+            best_grid = grid;
+            min_error = error;
+        }
+    }
+    return best_grid;
+}
+
+// inspired from LLaVA-UHD:
+//    -> https://arxiv.org/pdf/2403.11703
+//    -> https://github.com/thunlp/LLaVA-UHD
+//    -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
+static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
+    const std::pair<int, int> original_size={img->nx,img->ny};
+    const int original_width = img->nx;
+    const int original_height = img->ny;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+    std::vector<std::vector<clip_image_u8 *>> images;
+    LOG_TEE("%s: multiple %d\n", __func__, multiple);
+    images.push_back(std::vector<clip_image_u8 *>());
+
+    if (multiple <= 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
+        images[images.size()-1].push_back(source_image);
+    }
+    else if (multiple > 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
+        LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        images[images.size()-1].push_back(source_image);
+
+        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+        LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+
+        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
+        clip_image_u8 * refine_image = clip_image_u8_init();
+        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
+
+        LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+
+        // split_to_patches
+        int width = refine_image->nx;
+        int height = refine_image->ny;
+        int grid_x = int(width / best_grid.first);
+        int grid_y = int(height / best_grid.second);
+        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
+            images.push_back(std::vector<clip_image_u8 *>());
+            for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
+                clip_image_u8 * patch = clip_image_u8_init();
+                patch->nx = grid_x;
+                patch->ny = grid_y;
+                patch->buf.resize(3 * patch->nx * patch->ny);
+                for (int y = patches_i; y < patches_i + grid_y; ++y) {
+                    for (int x = patches_j; x < patches_j + grid_x; ++x) {
+                        const int i = 3 * (y * refine_image->nx + x);
+                        const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
+                        patch->buf[j]   = refine_image->buf[i];
+                        patch->buf[j+1] = refine_image->buf[i+1];
+                        patch->buf[j+2] = refine_image->buf[i+2];
+                    }
+                }
+                images[images.size()-1].push_back(patch);
+            }
+        }
+    }
+    return images;
+}
+
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
+    const int max_slice_nums=9;
+    const int scale_resolution=448;
+    const int original_width = ctx_clip->load_image_size->width;
+    const int original_height = ctx_clip->load_image_size->height;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+    std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+    return best_grid.first;
+}
+
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
+
+    if(clip_is_minicpmv(ctx)){
+        int max_slice_nums = 9;
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
+        res_imgs->size = 0;
+        for (size_t i = 0; i < imgs.size(); ++i){
+            res_imgs->size += imgs[i].size();
+        }
+        res_imgs->data = new clip_image_f32[res_imgs->size];
+        int idx = 0;
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                clip_image_f32 * res = clip_image_f32_init();
+                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
+                res_imgs->data[idx++] = *res;
+                clip_image_f32_free(res);
+            }
+        }
+        return true;
+    }
+
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -1842,11 +2205,104 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
         n_patches /= 4;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        if (ctx->minicpmv_version == 2) {
+            n_patches = 96;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            n_patches = 64;
+        }
     }
 
     return n_patches;
 }
 
+static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
+    assert(embed_dim % 2 == 0);
+    int H = pos.size();
+    int W = pos[0].size();
+
+    std::vector<float> omega(embed_dim / 2);
+    for (int i = 0; i < embed_dim / 2; ++i) {
+        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
+    }
+
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                float out_value = pos[h][w] * omega[d];
+                emb[h][w][d] = sin(out_value);
+                emb[h][w][d + embed_dim / 2] = cos(out_value);
+            }
+        }
+    }
+
+    return emb;
+}
+
+static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
+    assert(embed_dim % 2 == 0);
+    std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
+    std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
+
+    int H = emb_h.size();
+    int W = emb_h[0].size();
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                emb[h][w][d] = emb_h[h][w][d];
+                emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
+            }
+        }
+    }
+    return emb;
+}
+
+static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
+    int grid_h_size = image_size.first;
+    int grid_w_size = image_size.second;
+
+    std::vector<float> grid_h(grid_h_size);
+    std::vector<float> grid_w(grid_w_size);
+
+    for (int i = 0; i < grid_h_size; ++i) {
+        grid_h[i] = static_cast<float>(i);
+    }
+    for (int i = 0; i < grid_w_size; ++i) {
+        grid_w[i] = static_cast<float>(i);
+    }
+
+    std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid[h][w] = grid_w[w];
+        }
+    }
+    std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid_2d[0][h][w] = grid_h[h];
+            grid_2d[1][h][w] = grid_w[w];
+        }
+    }
+
+    std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
+
+    int H = image_size.first;
+    int W = image_size.second;
+    std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
+        }
+    }
+
+    return pos_embed_2d;
+}
+
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -1869,19 +2325,33 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }
+    if (ctx->has_minicpmv_projector) {
+        GGML_ASSERT(batch_size == 1);
+    }
 
     // build the inference graph
-    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
 
     // set inputs
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
-    const int image_size    = hparams.image_size;
+    const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        image_size_width  = imgs->data[0].nx;
+        image_size_height = imgs->data[0].ny;
+    }
     const int patch_size    = hparams.patch_size;
-    const int num_patches   = ((image_size / patch_size) * (image_size / patch_size));
+    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    if(ctx->load_image_size==nullptr){
+        ctx->load_image_size= clip_image_size_init();
+    }
+    const int pos_w = ctx->load_image_size->width/patch_size;
+    const int pos_h = ctx->load_image_size->height/patch_size;
 
     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1890,7 +2360,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         for (size_t i = 0; i < imgs->size; i++) {
             const int nx = imgs->data[i].nx;
             const int ny = imgs->data[i].ny;
-            GGML_ASSERT(nx == image_size && ny == image_size);
+            if (!ctx->has_minicpmv_projector) {
+                GGML_ASSERT(nx == image_size && ny == image_size);
+            }
 
             const int n = nx * ny;
 
@@ -1907,37 +2379,87 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
+    if (ctx->has_minicpmv_projector) {
+        {
+            // inspired from siglip:
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            int bucket_coords_h[70];
+            int bucket_coords_w[70];
+            for (int i = 0; i < pos_h; i++){
+                bucket_coords_h[i] = std::floor(70.0*i/pos_h);
+            }
+            for (int i = 0; i < pos_w; i++){
+                bucket_coords_w[i] = std::floor(70.0*i/pos_w);
+            }
+            for (int i = 0, id = 0; i < pos_h; i++){
+                for (int j = 0; j < pos_w; j++){
+                    positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
+                }
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+        }
 
-    {
-        if (ctx->has_class_embedding) {
-            struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
+        {
+            // inspired from resampler of Qwen-VL:
+            //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
+            //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
+            struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
+            int embed_dim = 4096;
+            if (ctx->minicpmv_version == 2) {
+                embed_dim = 4096;
+            }
+            else if (ctx->minicpmv_version == 3) {
+                embed_dim = 3584;
+            }
+            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
 
-            void* zero_mem = malloc(ggml_nbytes(embeddings));
-            memset(zero_mem, 0, ggml_nbytes(embeddings));
-            ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-            free(zero_mem);
+            float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
+            for(int i=0;i<pos_w * pos_h;++i){
+                for(int j=0;j<embed_dim;++j){
+                    pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+                }
+            }
+
+            ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
+            free(pos_embed_data);
         }
     }
+    else{
+        {
+            if (ctx->has_class_embedding) {
+                struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
 
-    {
-        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+                void* zero_mem = malloc(ggml_nbytes(embeddings));
+                memset(zero_mem, 0, ggml_nbytes(embeddings));
+                ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
+                free(zero_mem);
+            }
+        }
 
-        int* positions_data = (int*)malloc(ggml_nbytes(positions));
-        for (int i = 0; i < num_positions; i++) {
-            positions_data[i] = i;
+        {
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            for (int i = 0; i < num_positions; i++) {
+                positions_data[i] = i;
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
         }
-        ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
-        free(positions_data);
-    }
 
-    {
-        struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-        int* patches_data = (int*)malloc(ggml_nbytes(patches));
-        for (int i = 0; i < num_patches; i++) {
-            patches_data[i] = i + 1;
+        {
+            struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+            int* patches_data = (int*)malloc(ggml_nbytes(patches));
+            for (int i = 0; i < num_patches; i++) {
+                patches_data[i] = i + 1;
+            }
+            ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+            free(patches_data);
         }
-        ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-        free(patches_data);
     }
 
     if (ggml_backend_is_cpu(ctx->backend)) {
@@ -2107,7 +2629,22 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        if (ctx->minicpmv_version == 2) {
+            return 4096;
+        }
+        else if (ctx->minicpmv_version == 3) {
+            return 3584;
+        }
+    }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
 }
+
+int clip_is_minicpmv(const struct clip_ctx * ctx) {
+    if (ctx->has_minicpmv_projector) {
+        return ctx->minicpmv_version;
+    }
+    return 0;
+}
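
Note: the new get_2d_sincos_pos_embed* helpers above port the resampler's fixed 2D sine/cosine position table to C++. As a compact restatement (notation mine, not from the diff): for grid indices (i, j) of an H x W patch grid and embedding width D (divisible by 4), with frequencies omega_k = 10000^(-k / (D/4)) for k = 0 .. D/4-1, each position gets the concatenation [ sin(i*omega_k) | cos(i*omega_k) | sin(j*omega_k) | cos(j*omega_k) ], and positions are flattened at index j*H + i. This is the table that clip_image_batch_encode copies into the "pos_embed" input tensor for the MiniCPM-V path.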

+ 12 - 3
llama/clip.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -44,14 +44,17 @@
 #    define CLIP_API
 #endif
 
-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 struct clip_ctx;
 
+struct clip_image_size {
+    int width;
+    int height;
+};
+
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
@@ -81,6 +84,10 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
 CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
+CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
+CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+
+CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
@@ -104,6 +111,8 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
+CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+
 #ifdef __cplusplus
 }
 #endif
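
For orientation, a minimal caller-side sketch of how these new entry points fit together for a MiniCPM-V projector. This is a hedged illustration, not code from this commit: the helper name, mmproj path, and image dimensions are placeholders, and cleanup/error handling is omitted; only the clip_* calls come from the header above.

    #include "clip.h"

    // hypothetical helper (not part of this commit): report the embedding size a
    // MiniCPM-V mmproj will produce once the original image size is registered
    static int minicpmv_embed_floats(const char * mmproj_path, int img_w, int img_h) {
        struct clip_ctx * ctx = clip_model_load(mmproj_path, /*verbosity=*/1);
        if (ctx == nullptr || clip_is_minicpmv(ctx) == 0) {
            return 0; // not a MiniCPM-V projector; the resampler path does not apply
        }

        struct clip_image_size * load_size = clip_image_size_init(); // defaults to 448x448
        load_size->width  = img_w;
        load_size->height = img_h;
        clip_add_load_image_size(ctx, load_size); // consulted when the vision graph is built

        // how many image-embedding columns the UHD slicing grid yields for this aspect ratio
        const int n_cols = clip_uhd_num_image_embeds_col(ctx);
        (void) n_cols;

        // each encoded slice yields clip_n_patches() embeddings of clip_n_mmproj_embd() floats:
        // 96 x 4096 for minicpmv_version == 2, 64 x 3584 for version == 3
        return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx);
    }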

+ 446 - 40
llama/common.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -103,6 +103,41 @@
 
 using json = nlohmann::ordered_json;
 
+//
+// Environment variable utils
+//
+
+template<typename T>
+static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::string(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stoi(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<std::is_floating_point<T>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stof(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<std::is_same<T, bool>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    if (value) {
+        std::string val(value);
+        target = val == "1" || val == "true";
+    }
+}
+
 //
 // CPU utils
 //
@@ -136,8 +171,34 @@ int32_t cpu_get_num_physical_cores() {
     if (result == 0) {
         return num_physical_cores;
     }
-#elif defined(_WIN32)
-    //TODO: Implement
+#elif defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    unsigned int n_threads_win = std::thread::hardware_concurrency();
+    unsigned int default_threads = n_threads_win > 0 ? (n_threads_win <= 4 ? n_threads_win : n_threads_win / 2) : 4;
+
+    DWORD buffer_size = 0;
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &buffer_size)) {
+        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+            return default_threads;
+        }
+    }
+
+    std::vector<char> buffer(buffer_size);
+    if (!GetLogicalProcessorInformationEx(RelationProcessorCore, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data()), &buffer_size)) {
+        return default_threads;
+    }
+
+    int32_t num_physical_cores = 0;
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(buffer.data());
+    while (buffer_size > 0) {
+        if (info->Relationship == RelationProcessorCore) {
+            num_physical_cores += info->Processor.GroupCount;
+        }
+        buffer_size -= info->Size;
+        info = reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(reinterpret_cast<char*>(info) + info->Size);
+    }
+
+    return num_physical_cores > 0 ? num_physical_cores : default_threads;
 #endif
     unsigned int n_threads = std::thread::hardware_concurrency();
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
@@ -216,16 +277,61 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }
 
-//
-// CLI argument parsing
-//
+// Helper for setting process priority
 
-void gpt_params_handle_hf_token(gpt_params & params) {
-    if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
-        params.hf_token = std::getenv("HF_TOKEN");
+#if defined(_WIN32)
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        return true;
+    }
+
+    DWORD p = NORMAL_PRIORITY_CLASS;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
+        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
+        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
+        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
+    }
+
+    if (!SetPriorityClass(GetCurrentProcess(), p)) {
+        fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+        return false;
     }
+
+    return true;
 }
 
+#else // MacOS and POSIX
+#include <sys/types.h>
+#include <sys/resource.h>
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        return true;
+    }
+
+    int p = 0;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   p =  0;  break;
+        case GGML_SCHED_PRIO_MEDIUM:   p = -5;  break;
+        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
+        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
+    }
+
+    if (!setpriority(PRIO_PROCESS, 0, p)) {
+        fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+        return false;
+    }
+    return true;
+}
+
+#endif
+
+//
+// CLI argument parsing
+//
+
 void gpt_params_handle_model_default(gpt_params & params) {
     if (!params.hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
@@ -248,6 +354,30 @@ void gpt_params_handle_model_default(gpt_params & params) {
     }
 }
 
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
+    int32_t n_set = 0;
+
+    if (cpuparams.n_threads < 0) {
+        // Assuming everything about cpuparams is invalid
+        if (role_model != nullptr) {
+            cpuparams = *role_model;
+        } else {
+            cpuparams.n_threads = cpu_get_num_math();
+        }
+    }
+
+    for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (cpuparams.cpumask[i]) {
+            n_set++;
+        }
+    }
+
+    if (n_set && n_set < cpuparams.n_threads) {
+        // Not enough set bits, may experience performance issues.
+        fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+    }
+}
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
@@ -267,13 +397,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         }
     }
 
+    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
     if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
     gpt_params_handle_model_default(params);
 
-    gpt_params_handle_hf_token(params);
+    if (params.hf_token.empty()) {
+        get_env("HF_TOKEN", params.hf_token);
+    }
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -293,6 +430,32 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
+void gpt_params_parse_from_env(gpt_params & params) {
+    // we only care about server-related params for now
+    get_env("LLAMA_ARG_MODEL",            params.model);
+    get_env("LLAMA_ARG_MODEL_URL",        params.model_url);
+    get_env("LLAMA_ARG_MODEL_ALIAS",      params.model_alias);
+    get_env("LLAMA_ARG_HF_REPO",          params.hf_repo);
+    get_env("LLAMA_ARG_HF_FILE",          params.hf_file);
+    get_env("LLAMA_ARG_THREADS",          params.cpuparams.n_threads);
+    get_env("LLAMA_ARG_CTX_SIZE",         params.n_ctx);
+    get_env("LLAMA_ARG_N_PARALLEL",       params.n_parallel);
+    get_env("LLAMA_ARG_BATCH",            params.n_batch);
+    get_env("LLAMA_ARG_UBATCH",           params.n_ubatch);
+    get_env("LLAMA_ARG_N_GPU_LAYERS",     params.n_gpu_layers);
+    get_env("LLAMA_ARG_THREADS_HTTP",     params.n_threads_http);
+    get_env("LLAMA_ARG_CHAT_TEMPLATE",    params.chat_template);
+    get_env("LLAMA_ARG_N_PREDICT",        params.n_predict);
+    get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
+    get_env("LLAMA_ARG_ENDPOINT_SLOTS",   params.endpoint_slots);
+    get_env("LLAMA_ARG_EMBEDDINGS",       params.embedding);
+    get_env("LLAMA_ARG_FLASH_ATTN",       params.flash_attn);
+    get_env("LLAMA_ARG_DEFRAG_THOLD",     params.defrag_thold);
+    get_env("LLAMA_ARG_CONT_BATCHING",    params.cont_batching);
+    get_env("LLAMA_ARG_HOST",             params.hostname);
+    get_env("LLAMA_ARG_PORT",             params.port);
+}
+
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     const auto params_org = params; // the example can modify the default params
 
@@ -311,6 +474,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+    size_t dash_loc = range.find('-');
+    if (dash_loc == std::string::npos) {
+        fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+        return false;
+    }
+
+    size_t start_i;
+    size_t end_i;
+
+    if (dash_loc == 0) {
+        start_i = 0;
+    } else {
+        start_i = std::stoull(range.substr(0, dash_loc));
+        if (start_i >= GGML_MAX_N_THREADS) {
+            fprintf(stderr, "Start index out of bounds!\n");
+            return false;
+        }
+    }
+
+    if (dash_loc == range.length() - 1) {
+        end_i = GGML_MAX_N_THREADS - 1;
+    } else {
+        end_i = std::stoull(range.substr(dash_loc + 1));
+        if (end_i >= GGML_MAX_N_THREADS) {
+            fprintf(stderr, "End index out of bounds!\n");
+            return false;
+        }
+    }
+
+    for (size_t i = start_i; i <= end_i; i++) {
+        boolmask[i] = true;
+    }
+
+    return true;
+}
+
+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+    // Discard potential 0x prefix
+    size_t start_i = 0;
+    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
+        start_i = 2;
+    }
+
+    size_t num_digits = mask.length() - start_i;
+    if (num_digits > 128) num_digits = 128;
+
+    size_t end_i = num_digits + start_i;
+
+    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
+        char c = mask.at(i);
+        int8_t id = c;
+
+        if ((c >= '0' && c <= '9')) {
+            id -= '0';
+        } else if (c >= 'a' && c <= 'f') {
+            id -= 'a' - 10;
+        } else if (c >= 'A' && c <= 'F') {
+            id -= 'A' - 10;
+        } else {
+            fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
+            return false;
+        }
+
+        boolmask[  n  ] = boolmask[  n  ] || ((id & 8) != 0);
+        boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
+        boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
+        boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
+    }
+
+    return true;
+}
+
 #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
 
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
@@ -327,36 +563,142 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "-t" || arg == "--threads") {
         CHECK_ARG
-        params.n_threads = std::stoi(argv[i]);
-        if (params.n_threads <= 0) {
-            params.n_threads = std::thread::hardware_concurrency();
+        params.cpuparams.n_threads = std::stoi(argv[i]);
+        if (params.cpuparams.n_threads <= 0) {
+            params.cpuparams.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-C" || arg == "--cpu-mask") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "-Cr" || arg == "--cpu-range") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "--prio") {
+        CHECK_ARG
+        params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict") {
+        CHECK_ARG
+        params.cpuparams.strict_cpu = std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--poll") {
+        CHECK_ARG
+        params.cpuparams.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-tb" || arg == "--threads-batch") {
         CHECK_ARG
-        params.n_threads_batch = std::stoi(argv[i]);
-        if (params.n_threads_batch <= 0) {
-            params.n_threads_batch = std::thread::hardware_concurrency();
+        params.cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.cpuparams_batch.n_threads <= 0) {
+            params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Cb" || arg == "--cpu-mask-batch") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "-Crb" || arg == "--cpu-range_batch") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "--prio-batch") {
+        CHECK_ARG
+        params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch") {
+        params.cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch") {
+        CHECK_ARG
+        params.cpuparams_batch.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-td" || arg == "--threads-draft") {
         CHECK_ARG
-        params.n_threads_draft = std::stoi(argv[i]);
-        if (params.n_threads_draft <= 0) {
-            params.n_threads_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams.n_threads <= 0) {
+            params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
         }
         return true;
+    }
+        if (arg == "-Cd" || arg == "--cpu-mask-draft") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.draft_cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "-Crd" || arg == "--cpu-range-draft") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.draft_cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "--prio-draft") {
+        CHECK_ARG
+        params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-draft") {
+        params.draft_cpuparams.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-draft") {
+        CHECK_ARG
+        params.draft_cpuparams.poll = std::stoul(argv[i]);
+        return true;
     }
     if (arg == "-tbd" || arg == "--threads-batch-draft") {
         CHECK_ARG
-        params.n_threads_batch_draft = std::stoi(argv[i]);
-        if (params.n_threads_batch_draft <= 0) {
-            params.n_threads_batch_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams_batch.n_threads <= 0) {
+            params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.draft_cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "--prio-batch-draft") {
+        CHECK_ARG
+        params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch-draft") {
+        params.draft_cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch-draft") {
+        CHECK_ARG
+        params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-p" || arg == "--prompt") {
         CHECK_ARG
         params.prompt = argv[i];
@@ -851,7 +1193,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         CHECK_ARG
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
@@ -1441,11 +1783,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*",           "       --no-display-prompt",    "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*",           "-co,   --color",                "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
     options.push_back({ "*",           "-s,    --seed SEED",            "RNG seed (default: %d, use random seed for < 0)", params.seed });
-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.n_threads });
+    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
     options.push_back({ "*",           "-tb,   --threads-batch N",      "number of threads to use during batch and prompt processing (default: same as --threads)" });
     options.push_back({ "speculative", "-td,   --threads-draft N",      "number of threads to use during generation (default: same as --threads)" });
-    options.push_back({ "speculative", "-tbd,  --threads-batch-draft N",
-                                                                        "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+    options.push_back({ "speculative", "-tbd,  --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+
+#ifndef GGML_USE_OPENMP
+    // these options are available only with the internal threadpool
+    options.push_back({ "*",           "-C,    --cpu-mask M",            "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
+    options.push_back({ "*",           "-Cr,   --cpu-range lo-hi",       "range of CPUs for affinity. Complements --cpu-mask"});
+    options.push_back({ "*",           "       --cpu-strict <0|1>",      "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
+    options.push_back({ "*",           "       --priority N",            "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
+    options.push_back({ "*",           "       --poll <0...100>",        "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
+
+    options.push_back({ "*",           "-Cb,   --cpu-mask-batch M",      "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
+    options.push_back({ "*",           "-Crb,  --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
+    options.push_back({ "*",           "       --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
+    options.push_back({ "*",           "       --priority-batch N",      "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
+    options.push_back({ "*",           "       --poll-batch <0|1>",      "use polling to wait for work (default: same as --poll"});
+
+    options.push_back({ "speculative", "-Cd,   --cpu-mask-draft M",      "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
+    options.push_back({ "speculative", "-Crd,  --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
+    options.push_back({ "speculative", "       --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
+    options.push_back({ "speculative", "       --priority-draft N",      "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
+    options.push_back({ "speculative", "       --poll-draft <0|1>",      "Use polling to wait for draft model work (default: same as --poll])"});
+
+    options.push_back({ "speculative", "-Cbd,  --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
+    options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
+                                                                         "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
+    options.push_back({ "speculative", "       --cpu-strict-batch-draft <0|1>",
+                                                                         "Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
+    options.push_back({ "speculative", "       --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
+    options.push_back({ "speculative", "       --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"});
+#endif // GGML_USE_OPENMP
+
     options.push_back({ "speculative", "       --draft N",              "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
     options.push_back({ "speculative", "-ps,   --p-split N",            "speculative decoding split probability (default: %.1f)", (double)params.p_split });
     options.push_back({ "*",           "-lcs,  --lookup-cache-static FNAME",
@@ -1717,7 +2088,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "export-lora", "-m,    --model",                "model path from which to load base model (default '%s')", params.model.c_str() });
     options.push_back({ "export-lora", "       --lora FNAME",           "path to LoRA adapter  (can be repeated to use multiple adapters)" });
     options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" });
-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during computation (default: %d)", params.n_threads });
     options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });
 
     printf("usage: %s [options]\n", argv[0]);
@@ -1749,11 +2119,17 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 std::string gpt_params_get_system_info(const gpt_params & params) {
     std::ostringstream os;
 
-    os << "system_info: n_threads = " << params.n_threads;
-    if (params.n_threads_batch != -1) {
-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    os << "system_info: n_threads = " << params.cpuparams.n_threads;
+    if (params.cpuparams_batch.n_threads != -1) {
+        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
     }
+#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
+    // TODO: windows + arm64 + mingw64
+    DWORD logicalProcessorCount = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+    os << " / " << logicalProcessorCount << " | " << llama_print_system_info();
+#else
     os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();
+#endif
 
     return os.str();
 }
@@ -1803,6 +2179,23 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
+}
+
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
@@ -2171,7 +2564,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
             tmp.clear();
             tmp.push_back(decoder_start_token_id);
         }
-        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        if (llama_model_has_decoder(model)) {
+            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+        }
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
         llama_reset_timings(lctx);
@@ -2250,8 +2645,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_seq_max         = params.n_parallel;
     cparams.n_batch           = params.n_batch;
     cparams.n_ubatch          = params.n_ubatch;
-    cparams.n_threads         = params.n_threads;
-    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.n_threads         = params.cpuparams.n_threads;
+    cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
+                                    params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.seed              = params.seed;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
@@ -2277,6 +2673,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     return cparams;
 }
 
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+    struct ggml_threadpool_params tpp;
+
+    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
+
+    if (params.mask_valid) {
+        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
+    }
+
+    tpp.prio       = params.priority;
+    tpp.poll       = params.poll;
+    tpp.strict_cpu = params.strict_cpu;
+
+    return tpp;
+}
+
 #ifdef LLAMA_USE_CURL
 
 static bool starts_with(const std::string & str, const std::string & prefix) {
@@ -2715,12 +3127,6 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
     return text;
 }
 
-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // Chat template utils
 //
@@ -3272,7 +3678,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);

+ 27 - 13
llama/common.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -93,13 +93,18 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
+struct cpu_params {
+    int      n_threads                   = -1;
+    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool     mask_valid                  = false;   // Default: any CPU
+    enum ggml_sched_priority  priority   = GGML_SCHED_PRIO_NORMAL;  // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool     strict_cpu                  = false;   // Use strict CPU placement
+    uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 struct gpt_params {
     uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed
 
-    int32_t n_threads             = cpu_get_num_math();
-    int32_t n_threads_draft       =    -1;
-    int32_t n_threads_batch       =    -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft =    -1;
     int32_t n_predict             =    -1; // new tokens to predict
     int32_t n_ctx                 =     0; // context size
     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -126,6 +131,11 @@ struct gpt_params {
     int32_t yarn_orig_ctx         =     0; // YaRN original context length
     float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
 
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
+
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data                 = nullptr;
 
@@ -230,7 +240,7 @@ struct gpt_params {
     int32_t port           = 8080;         // server listens on this network port
     int32_t timeout_read   = 600;          // http read timeout in seconds
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
+    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname      = "127.0.0.1";
     std::string public_path   = "";
@@ -293,7 +303,7 @@ struct gpt_params {
     std::string lora_outfile = "ggml-lora-merged-f16.gguf";
 };
 
-void gpt_params_handle_hf_token(gpt_params & params);
+void gpt_params_parse_from_env(gpt_params & params);
 void gpt_params_handle_model_default(gpt_params & params);
 
 bool gpt_params_parse_ex   (int argc, char ** argv, gpt_params & params);
@@ -303,6 +313,11 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
 
 std::string gpt_params_get_system_info(const gpt_params & params);
 
+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
+
 //
 // String utils
 //
@@ -312,6 +327,8 @@ std::vector<std::string> string_split(std::string input, char separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
 
+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     std::vector<T> values;
@@ -351,8 +368,9 @@ struct llama_init_result {
 
 struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);
 
-struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model_params     llama_model_params_from_gpt_params    (const gpt_params & params);
+struct llama_context_params   llama_context_params_from_gpt_params  (const gpt_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
@@ -404,10 +422,6 @@ std::string llama_detokenize(
         const std::vector<llama_token> & tokens,
                                   bool   special = true);
 
-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // Chat template utils
 //
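As a quick illustration of the environment-variable path declared above, a sketch that sets a few of the LLAMA_ARG_* variables from the diff and reads them back through gpt_params_parse_from_env (setenv is POSIX; nothing beyond the names shown in the diff is assumed):

    #include "common.h"
    #include <cstdio>
    #include <cstdlib>

    int main() {
        setenv("LLAMA_ARG_CTX_SIZE",   "4096", 1);
        setenv("LLAMA_ARG_THREADS",    "8",    1);
        setenv("LLAMA_ARG_FLASH_ATTN", "1",    1); // boolean overload accepts "1"/"true"

        gpt_params params;
        gpt_params_parse_from_env(params);

        printf("n_ctx=%d threads=%d flash_attn=%d\n",
               params.n_ctx, params.cpuparams.n_threads, (int) params.flash_attn);
        return 0;
    }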

+ 9 - 22
llama/ggml-aarch64.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -42,6 +42,8 @@
 
 #if defined(__GNUC__)
 #pragma GCC diagnostic ignored "-Woverlength-strings"
+#elif defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
 #define UNUSED GGML_UNUSED
@@ -361,33 +363,18 @@ static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict ds
 }
 
 size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    if (!quant_weights) {
-        return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
-    }
-    else {
-        assert(false);
-        return 0;
-    }
+    UNUSED(quant_weights);
+    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
 }
 
 size_t quantize_q4_0_4x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    if (!quant_weights) {
-        return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
-    }
-    else {
-        assert(false);
-        return 0;
-    }
+    UNUSED(quant_weights);
+    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 8);
 }
 
 size_t quantize_q4_0_8x8(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
-    if (!quant_weights) {
-        return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
-    }
-    else {
-        assert(false);
-        return 0;
-    }
+    UNUSED(quant_weights);
+    return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 8, 8);
 }
 
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {

+ 1 - 1
llama/ggml-aarch64.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-alloc.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 3 - 3
llama/ggml-alloc.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -33,8 +33,8 @@ extern "C" {
 #endif
 
 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
+typedef struct      ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct             ggml_backend * ggml_backend_t;
 
 // Tensor allocator
 struct ggml_tallocr {

+ 1 - 1
llama/ggml-backend-impl.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 41 - 24
llama/ggml-backend.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -382,15 +382,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
     }
 
     // an async copy would normally happen after all the queued operations on both backends are completed
-    // sync src, set_async dst
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_synchronize(backend_src);
-        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
-    } else {
-        ggml_backend_synchronize(backend_src);
-        ggml_backend_tensor_copy(src, dst);
-        ggml_backend_synchronize(backend_dst);
-    }
+    // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+    ggml_backend_synchronize(backend_src);
+    ggml_backend_synchronize(backend_dst);
+    ggml_backend_tensor_copy(src, dst);
 }
 
 // events
@@ -758,9 +753,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 #endif
 
 struct ggml_backend_cpu_context {
-    int n_threads;
-    void * work_data;
-    size_t work_size;
+    int                 n_threads;
+    ggml_threadpool_t   threadpool;
+
+    void *              work_data;
+    size_t              work_size;
 
     ggml_abort_callback abort_callback;
     void *              abort_callback_data;
@@ -795,7 +792,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
@@ -832,7 +829,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
     if (cpu_ctx->work_size < cplan.work_size) {
         free(cpu_ctx->work_data);
@@ -909,6 +906,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     }
 
     ctx->n_threads           = GGML_DEFAULT_N_THREADS;
+    ctx->threadpool          = NULL;
     ctx->work_data           = NULL;
     ctx->work_size           = 0;
     ctx->abort_callback      = NULL;
@@ -939,6 +937,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+    if (ctx->threadpool && ctx->threadpool != threadpool) {
+        // already had a different threadpool, pause/suspend it before switching
+        ggml_threadpool_pause(ctx->threadpool);
+    }
+    ctx->threadpool = threadpool;
+}
+
 void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
     GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
 
@@ -1054,10 +1064,6 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #define GGML_SCHED_MAX_BACKENDS 16
 #endif
 
-#ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 2048
-#endif
-
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
 #define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif
@@ -1161,7 +1167,8 @@ static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, co
 }
 
 #if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
+#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
 #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
 #define GET_CAUSE(node) causes[hash_id(node)]
 #else
@@ -1585,7 +1592,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
                     GGML_ASSERT(sched->splits != NULL);
                 }
-                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
                 split = &sched->splits[i_split];
                 split->backend_id = node_backend_id;
                 split->i_start = i;
@@ -1813,7 +1819,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 } else {
                     ggml_backend_synchronize(split_backend);
                 }
-                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+                // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+                // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+                if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                    ggml_backend_synchronize(input_backend);
+                    if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                        ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                    } else {
+                        ggml_backend_synchronize(split_backend);
+                    }
+                    ggml_backend_tensor_copy(input, input_cpy);
+                }
             }
         }
 
@@ -1891,13 +1907,14 @@ ggml_backend_sched_t ggml_backend_sched_new(
     sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
     sched->hv_tensor_copies      = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
 
-    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    const size_t ggml_sched_max_splits = graph_size; // at most there is one split for each node in the graph
+    const size_t nodes_size = graph_size + ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
     sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
     sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
     sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
     sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
 
-    sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
+    sched->context_buffer_size = ggml_sched_max_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
     sched->context_buffer = malloc(sched->context_buffer_size);
 
     const int initial_splits_capacity = 16;
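A hedged sketch of how the new CPU-backend threadpool hook is meant to be driven; ggml_threadpool_new/ggml_threadpool_free are assumed to come from the same sync and are not shown in this diff:

    // Create a threadpool and attach it to the CPU backend so graph compute
    // reuses it instead of spinning up threads per call (sketch only).
    #include "ggml.h"
    #include "ggml-backend.h"

    int main() {
        struct ggml_threadpool_params tpp;
        ggml_threadpool_params_init(&tpp, /*n_threads=*/4);

        ggml_threadpool_t tp = ggml_threadpool_new(&tpp);

        ggml_backend_t cpu = ggml_backend_cpu_init();
        ggml_backend_cpu_set_threadpool(cpu, tp);

        // ... build a graph and run it with ggml_backend_graph_compute(cpu, gf) ...

        ggml_backend_free(cpu);
        ggml_threadpool_free(tp);
        return 0;
    }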

+ 3 - 1
llama/ggml-backend.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -89,6 +89,7 @@ extern "C" {
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
 
+    // "offset" refers to the offset of the tensor data for setting/getting data
     GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
 
@@ -128,6 +129,7 @@ extern "C" {
 
     GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
     GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
     GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer

+ 1 - 1
llama/ggml-common.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 43 - 21
llama/ggml-cuda.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -35,8 +35,10 @@
 #include "ggml-cuda/binbcast.cuh"
 #include "ggml-cuda/clamp.cuh"
 #include "ggml-cuda/concat.cuh"
+#include "ggml-cuda/conv-transpose-1d.cuh"
 #include "ggml-cuda/convert.cuh"
 #include "ggml-cuda/cpy.cuh"
+#include "ggml-cuda/cross-entropy-loss.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/dmmv.cuh"
 #include "ggml-cuda/fattn.cuh"
@@ -55,7 +57,6 @@
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
-#include "ggml-cuda/conv-transpose-1d.cuh"
 
 #include <algorithm>
 #include <array>
@@ -2211,6 +2212,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ADD:
             ggml_cuda_op_add(ctx, dst);
             break;
+        case GGML_OP_SUB:
+            ggml_cuda_op_sub(ctx, dst);
+            break;
         case GGML_OP_ACC:
             ggml_cuda_op_acc(ctx, dst);
             break;
@@ -2297,6 +2301,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SQRT:
             ggml_cuda_op_sqrt(ctx, dst);
             break;
+        case GGML_OP_SIN:
+            ggml_cuda_op_sin(ctx, dst);
+            break;
+        case GGML_OP_COS:
+            ggml_cuda_op_cos(ctx, dst);
+            break;
         case GGML_OP_CLAMP:
             ggml_cuda_op_clamp(ctx, dst);
             break;
@@ -2333,6 +2343,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_FLASH_ATTN_EXT:
             ggml_cuda_flash_attn_ext(ctx, dst);
             break;
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+            ggml_cuda_cross_entropy_loss(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -2388,33 +2401,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
 }
 
 GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
-
     ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
     ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
 
-    if (!ggml_backend_buffer_is_cuda(src->buffer)) {
+    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
         return false;
     }
 
-    if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
+    if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
         return false;
     }
 
-    // device -> device
+    // device -> device copy
     ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
     ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;
 
-    if (backend_src != backend_dst) {
-        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
-        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
 
-        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
-        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+    if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
+#ifndef NDEBUG
+        GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+#endif
+        return false;
+    }
 
+    if (backend_src != backend_dst) {
         // copy on src stream
         if (cuda_ctx_src->device == cuda_ctx_dst->device) {
-            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
         } else {
 #ifdef GGML_CUDA_NO_PEER_COPY
             return false;
@@ -2423,7 +2438,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
 #endif
         }
 
-        // record event on src stream
+        // record event on src stream after the copy
         if (!cuda_ctx_src->copy_event) {
             ggml_cuda_set_device(cuda_ctx_src->device);
             CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
@@ -2435,7 +2450,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
         CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
     } else {
         // src and dst are on the same backend
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
     }
     return true;
 }
@@ -2638,6 +2653,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
                 assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
                 for (int j = 0; j < GGML_MAX_SRC; j++) {
                     if (node->src[j] != nullptr) {
+                        assert(node->src[j]->buffer);
                         assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
                     }
                 }
@@ -2772,11 +2788,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_MUL_MAT_ID:
             {
                 struct ggml_tensor * a = op->src[0];
-                if (op->op == GGML_OP_MUL_MAT) {
-                    struct ggml_tensor * b = op->src[1];
-                    if (a->ne[3] != b->ne[3]) {
-                        return false;
-                    }
+                struct ggml_tensor * b = op->src[1];
+                if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
+                    return false;
+                }
+                if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
+                    return false;
                 }
                 switch (a->type) {
                     case GGML_TYPE_F32:
@@ -2880,12 +2897,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_TRANSPOSE:
         case GGML_OP_NORM:
         case GGML_OP_ADD:
+        case GGML_OP_SUB:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
+        case GGML_OP_SIN:
+        case GGML_OP_COS:
         case GGML_OP_CLAMP:
         case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
@@ -2907,7 +2927,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             return true;
         case GGML_OP_FLASH_ATTN_EXT:
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+            return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
 #else
             if (op->src[0]->ne[0] == 128) {
                 return true;
@@ -2917,6 +2937,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
             }
             return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA &&
                 op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
+        case GGML_OP_CROSS_ENTROPY_LOSS:
+            return true;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         default:
             return false;
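Since GGML_OP_SUB, GGML_OP_SIN and GGML_OP_COS are now accepted by the CUDA backend, graphs built with the corresponding ggml ops can be offloaded unchanged. A minimal CPU-side sketch of such a graph, assuming ggml.h from this sync (the same graph runs on CUDA by selecting that backend instead):

    #include "ggml.h"

    int main() {
        struct ggml_init_params ip = { /*mem_size=*/ 16*1024*1024, /*mem_buffer=*/ NULL, /*no_alloc=*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_f32(a, 1.0f);
        ggml_set_f32(b, 0.25f);

        // GGML_OP_SUB followed by GGML_OP_COS, both newly routed to CUDA above
        struct ggml_tensor * out = ggml_cos(ctx, ggml_sub(ctx, a, b));

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/2);

        ggml_free(ctx);
        return 0;
    }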

+ 1 - 1
llama/ggml-cuda.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/acc.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/acc.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/arange.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/arange.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argsort.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argsort.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 9 - 1
llama/ggml-cuda/binbcast.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -35,6 +35,10 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
     return a + b;
 }
 
+static __device__ __forceinline__ float op_sub(const float a, const float b) {
+    return a - b;
+}
+
 static __device__ __forceinline__ float op_mul(const float a, const float b) {
     return a * b;
 }
@@ -297,6 +301,10 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
 
+void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
+}
+
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
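The element-wise semantics that bin_bcast_cuda<op_sub> provides can be pictured with a host-side reference for a simple [rows x cols] minus [1 x cols] broadcast; this is an illustrative sketch only and does not reflect the kernel's actual indexing:

    #include <cstdio>
    #include <vector>

    int main() {
        const int rows = 2, cols = 4;
        std::vector<float> a(rows*cols, 1.0f); // src0
        std::vector<float> b(cols, 0.25f);     // src1, repeated across rows
        std::vector<float> d(rows*cols);

        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                d[r*cols + c] = a[r*cols + c] - b[c]; // op_sub applied per element
            }
        }
        printf("d[0]=%g d[last]=%g\n", d[0], d[rows*cols - 1]);
        return 0;
    }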

+ 2 - 1
llama/ggml-cuda/binbcast.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -28,5 +28,6 @@
 
 void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 1 - 1
llama/ggml-cuda/clamp.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/clamp.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/common.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/concat.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/concat.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/conv-transpose-1d.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/conv-transpose-1d.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/convert.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/convert.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cpy.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cpy.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 132 - 0
llama/ggml-cuda/cross-entropy-loss.cu

@@ -0,0 +1,132 @@
+/**
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "common.cuh"
+#include "cross-entropy-loss.cuh"
+#include "sumrows.cuh"
+
+#include <cmath>
+#include <cstdint>
+
+static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) {
+    const int warp_id = threadIdx.x / WARP_SIZE;
+    const int lane_id = threadIdx.x % WARP_SIZE;
+    const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE;
+
+    const int ne_tmp = WARP_SIZE*nclasses;
+
+    extern __shared__ float tmp_all[];
+    float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp;
+    float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp;
+
+    // Each warp first loads ne_tmp logits/labels into shared memory:
+    for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) {
+        const int ig = i0*nclasses + i; // ig == i global
+
+        tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f;
+        tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f;
+    }
+
+    // Each thread in the warp then calculates the cross entropy loss for a single row.
+    // TODO: pad in order to avoid shared memory bank conflicts.
+
+    // Find maximum for softmax:
+    float max = -INFINITY;
+    for (int i = 0; i < nclasses; ++i) {
+        max = fmaxf(max, tmp_logits[lane_id*nclasses + i]);
+    }
+
+    // Calculate log(softmax(logits)) which is just logits - max:
+    float sum = 0.0f;
+    for (int i = 0; i < nclasses; ++i) {
+        float val = tmp_logits[lane_id*nclasses + i] - max;
+        sum += expf(val);
+        tmp_logits[lane_id*nclasses + i] = val;
+    }
+    sum = logf(sum);
+
+    // log(exp(logits - max) / sum) = (logits - max) - log(sum)
+    float loss = 0.0f;
+    for (int i = 0; i < nclasses; ++i) {
+        loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i];
+    }
+    loss = -warp_reduce_sum(loss) / (float)k;
+
+    __syncthreads();
+
+    if (lane_id == 0) {
+        tmp_all[warp_id] = loss;
+    }
+
+    __syncthreads();
+
+    if (warp_id != 0) {
+        return;
+    }
+
+    loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? tmp_all[lane_id] : 0.0f;
+    loss = warp_reduce_sum(loss);
+
+    if (lane_id != 0) {
+        return;
+    }
+
+    dst[blockIdx.x] = loss;
+}
+
+void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int64_t ne00  = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       * dst_d  = (float       *) dst->data;
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t stream = ctx.stream();
+
+    const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
+    const dim3 blocks_num((nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
+    const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*ne00*sizeof(float);
+
+    ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
+
+    cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
+
+    // Combine results from individual blocks:
+    sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
+}

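In formula form, the new kernel computes the mean cross-entropy between the rows of src0 (logits x) and src1 (soft labels y), using the shift-by-max log-softmax shown above for numerical stability:

    \[
    L \;=\; -\frac{1}{k}\sum_{i=1}^{k}\sum_{c=1}^{C} y_{ic}\,
            \log\frac{e^{x_{ic}}}{\sum_{c'=1}^{C} e^{x_{ic'}}},
    \qquad C = \texttt{nclasses},\; k = \texttt{nrows}.
    \]

Each block writes one partial sum into dst_tmp, and sum_rows_f32_cuda (made non-static further down) folds the partials into the final scalar. Note the launch requests 2 * CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE * ne00 * sizeof(float) bytes of dynamic shared memory, i.e. about 2 KiB per class, so very large class counts would exceed typical per-block shared-memory limits.
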
+ 31 - 0
llama/ggml-cuda/cross-entropy-loss.cuh

@@ -0,0 +1,31 @@
+/**
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "common.cuh"
+
+#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
+
+void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 1 - 1
llama/ggml-cuda/dequantize.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/diagmask.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/diagmask.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/dmmv.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/dmmv.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 13 - 6
llama/ggml-cuda/fattn-common.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -48,6 +48,7 @@ typedef void (* fattn_kernel_t)(
         const float m0,
         const float m1,
         const uint32_t n_head_log2,
+        const float logit_softcap,
         const int ne00,
         const int ne01,
         const int ne02,
@@ -683,11 +684,17 @@ void launch_fattn(
     const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
     const int  shmem = 0;
 
-    float scale    = 1.0f;
-    float max_bias = 0.0f;
+    float scale         = 1.0f;
+    float max_bias      = 0.0f;
+    float logit_softcap = 0.0f;
 
-    memcpy(&scale,    (float *) KQV->op_params + 0, sizeof(float));
-    memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
+    memcpy(&scale,         (float *) KQV->op_params + 0, sizeof(float));
+    memcpy(&max_bias,      (float *) KQV->op_params + 1, sizeof(float));
+    memcpy(&logit_softcap, (float *) KQV->op_params + 2, sizeof(float));
+
+    if (logit_softcap != 0.0f) {
+        scale /= logit_softcap;
+    }
 
     const uint32_t n_head      = Q->ne[2];
     const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
@@ -701,7 +708,7 @@ void launch_fattn(
         V_data,
         mask ? ((const char *) mask->data) : nullptr,
         (parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
-        scale, max_bias, m0, m1, n_head_log2,
+        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
         Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
         K->ne[0], K->ne[1], K->ne[2], K->ne[3],
         mask ? mask->ne[1] : 0, mask ?  mask->nb[1] : 0,

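The new logit_softcap parameter implements attention logit soft-capping (as used by Gemma-2-style models): instead of feeding s * q^T k straight into softmax, the score is squashed as

    \[
    \operatorname{softcap}(x) \;=\; c \cdot \tanh\!\left(\frac{s\,x}{c}\right),
    \qquad x = q^{\top}k,\; s = \text{scale},\; c = \text{logit\_softcap},
    \]

with c = 0 meaning no capping. Pre-dividing scale by c on the host (as above) lets every kernel apply just c * tanhf(sum) to the already-scaled score, which is exactly what the per-kernel changes below do.
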
+ 44 - 10
llama/ggml-cuda/fattn-tile-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -30,7 +30,7 @@
 
 #define FATTN_KQ_STRIDE_TILE_F16 64
 
-template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
+template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -46,6 +46,7 @@ static __global__ void flash_attn_tile_ext_f16(
         const float m0,
         const float m1,
         const uint32_t n_head_log2,
+        const float logit_softcap,
         const int ne00,
         const int ne01,
         const int ne02,
@@ -70,6 +71,12 @@ static __global__ void flash_attn_tile_ext_f16(
         const int ne2,
         const int ne3) {
 #ifdef FP16_AVAILABLE
+    // Skip unused kernel variants for faster compilation:
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
 
     const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
@@ -180,7 +187,13 @@ static __global__ void flash_attn_tile_ext_f16(
             for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                 const int j_KQ = j_KQ_0 + threadIdx.y;
 
-                half sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                half sum;
+                if (use_logit_softcap) {
+                    const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                    sum = logit_softcap * tanhf(tmp.x + tmp.y);
+                } else {
+                    sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                }
                 sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
 
                 kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
@@ -296,20 +309,20 @@ static __global__ void flash_attn_tile_ext_f16(
 #endif // FP16_AVAILABLE
 }
 
-template <int cols_per_block, int parallel_blocks>
+template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
 void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * Q = dst->src[0];
     switch (Q->ne[0]) {
         case  64: {
             constexpr int      D = 64;
             constexpr int nwarps = 8;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
             launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
         } break;
         case 128: {
             constexpr int      D = 128;
             constexpr int nwarps = 8;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks>;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
             launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
         } break;
         default: {
@@ -322,24 +335,45 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten
     const ggml_tensor * KQV = dst;
     const ggml_tensor * Q   = dst->src[0];
 
-    const int32_t precision = KQV->op_params[2];
+    const int32_t precision = KQV->op_params[3];
     GGML_ASSERT(precision == GGML_PREC_DEFAULT);
 
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
     if (Q->ne[1] <= 16) {
         constexpr int cols_per_block = 16;
         constexpr int parallel_blocks = 4;
-        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     if (Q->ne[1] <= 32) {
         constexpr int cols_per_block = 32;
         constexpr int parallel_blocks = 4;
-        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     constexpr int cols_per_block = 32;
     constexpr int parallel_blocks = 1;
-    launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+    } else {
+        constexpr bool use_logit_softcap = true;
+        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+    }
 }

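The repeated if/else on logit_softcap == 0.0f in this launcher (and in the f32 tile, vec-f16, vec-f32 and wmma launchers below) turns a runtime value into a compile-time template argument, so the tanh path is compiled out entirely when capping is disabled. A minimal sketch of the idiom, with illustrative names rather than the actual functions:

    // runtime value -> compile-time bool dispatch
    template <bool use_logit_softcap>
    static void launch_variant(float logit_softcap) {
        (void) logit_softcap; // kernel launch would go here; the branch inside
                              // the kernel is resolved at compile time
    }

    static void dispatch_by_softcap(float logit_softcap) {
        if (logit_softcap == 0.0f) {
            launch_variant<false>(logit_softcap);
        } else {
            launch_variant<true>(logit_softcap);
        }
    }
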
+ 1 - 1
llama/ggml-cuda/fattn-tile-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 41 - 8
llama/ggml-cuda/fattn-tile-f32.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -30,7 +30,7 @@
 
 #define FATTN_KQ_STRIDE_TILE_F32 32
 
-template<int D, int ncols, int nwarps, int parallel_blocks> // D == head size
+template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -46,6 +46,7 @@ static __global__ void flash_attn_tile_ext_f32(
         const float m0,
         const float m1,
         const uint32_t n_head_log2,
+        const float logit_softcap,
         const int ne00,
         const int ne01,
         const int ne02,
@@ -69,6 +70,12 @@ static __global__ void flash_attn_tile_ext_f32(
         const int ne1,
         const int ne2,
         const int ne3) {
+    // Skip unused kernel variants for faster compilation:
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
 
     const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
@@ -177,6 +184,10 @@ static __global__ void flash_attn_tile_ext_f32(
             for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                 const int j_KQ = j_KQ_0 + threadIdx.y;
 
+                if (use_logit_softcap) {
+                    sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
+                }
+
                 sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
 
                 kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
@@ -293,20 +304,20 @@ static __global__ void flash_attn_tile_ext_f32(
     }
 }
 
-template <int cols_per_block, int parallel_blocks>
+template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
 void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * Q = dst->src[0];
     switch (Q->ne[0]) {
         case  64: {
             constexpr int      D = 64;
             constexpr int nwarps = 8;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
             launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
         } break;
         case 128: {
             constexpr int      D = 128;
             constexpr int nwarps = 8;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
             launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
         } break;
         default: {
@@ -316,23 +327,45 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
 }
 
 void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * KQV = dst;
     const ggml_tensor * Q = dst->src[0];
 
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
     if (Q->ne[1] <= 16) {
         constexpr int cols_per_block = 16;
         constexpr int parallel_blocks = 4;
-        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     if (Q->ne[1] <= 32) {
         constexpr int cols_per_block = 32;
         constexpr int parallel_blocks = 4;
-        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     constexpr int cols_per_block = 32;
     constexpr int parallel_blocks = 1;
-    launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks>(ctx, dst);
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+    } else {
+        constexpr bool use_logit_softcap = true;
+        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+    }
 }

+ 1 - 1
llama/ggml-cuda/fattn-tile-f32.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 59 - 14
llama/ggml-cuda/fattn-vec-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -27,7 +27,7 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
 
-template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size
+template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -43,6 +43,7 @@ static __global__ void flash_attn_vec_ext_f16(
         const float m0,
         const float m1,
         const uint32_t n_head_log2,
+        const float logit_softcap,
         const int ne00,
         const int ne01,
         const int ne02,
@@ -67,6 +68,12 @@ static __global__ void flash_attn_vec_ext_f16(
         const int ne2,
         const int ne3) {
 #ifdef FP16_AVAILABLE
+    // Skip unused kernel variants for faster compilation:
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
 
     constexpr vec_dot_KQ_f16_t vec_dot_KQ = get_vec_dot_KQ_f16<D>(type_K);
@@ -216,6 +223,11 @@ static __global__ void flash_attn_vec_ext_f16(
             for (int j = 0; j < ncols; ++j) {
                 half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
                 sum = warp_reduce_sum(sum);
+
+                if (use_logit_softcap) {
+                    sum = logit_softcap*tanhf(sum);
+                }
+
                 sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
 
                 if (ncols == 1) {
@@ -312,10 +324,10 @@ static __global__ void flash_attn_vec_ext_f16(
 #endif // FP16_AVAILABLE
 }
 
-template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V>
+template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
 void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     constexpr int nwarps = D/WARP_SIZE;
-    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V>;
+    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
     constexpr bool need_f16_K = D != 128;
     constexpr bool need_f16_V = D != 128 && D != 64;
     launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V);
@@ -323,48 +335,81 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx,
 
 template <int D, ggml_type type_K, ggml_type type_V>
 void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * KQV = dst;
-    ggml_tensor * Q   = dst->src[0];
-    ggml_tensor * K   = dst->src[1];
-    ggml_tensor * V   = dst->src[2];
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q   = dst->src[0];
+    const ggml_tensor * K   = dst->src[1];
+    const ggml_tensor * V   = dst->src[2];
 
-    const int32_t precision = KQV->op_params[2];
+    const int32_t precision = KQV->op_params[3];
     GGML_ASSERT(precision == GGML_PREC_DEFAULT);
 
     GGML_ASSERT(K->type == type_K);
     GGML_ASSERT(V->type == type_V);
 
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
     if (Q->ne[1] == 1) {
         constexpr int cols_per_block  = 1;
         constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     if (Q->ne[1] == 2) {
         constexpr int cols_per_block  = 2;
         constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     if (Q->ne[1] <= 4) {
         constexpr int cols_per_block  = 4;
         constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     if (Q->ne[1] <= 8) {
         constexpr int cols_per_block  = 8;
         constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     constexpr int cols_per_block  = 8;
     constexpr int parallel_blocks = 1;
-    ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+    } else {
+        constexpr bool use_logit_softcap = true;
+        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+    }
 }
 
 #define DECL_FATTN_VEC_F16_CASE(D, type_K, type_V)                          \

+ 58 - 12
llama/ggml-cuda/fattn-vec-f32.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -27,7 +27,7 @@
 #include "common.cuh"
 #include "fattn-common.cuh"
 
-template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V> // D == head size
+template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -43,6 +43,7 @@ static __global__ void flash_attn_vec_ext_f32(
         const float m0,
         const float m1,
         const uint32_t n_head_log2,
+        const float logit_softcap,
         const int ne00,
         const int ne01,
         const int ne02,
@@ -66,6 +67,12 @@ static __global__ void flash_attn_vec_ext_f32(
         const int ne1,
         const int ne2,
         const int ne3) {
+    // Skip unused kernel variants for faster compilation:
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
 
     constexpr vec_dot_KQ_f32_t vec_dot_KQ = get_vec_dot_KQ_f32<D>(type_K);
@@ -206,6 +213,11 @@ static __global__ void flash_attn_vec_ext_f32(
             for (int j = 0; j < ncols; ++j) {
                 float sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_f2[j], Q_i32[j], Q_ds[j]);
                 sum = warp_reduce_sum(sum);
+
+                if (use_logit_softcap) {
+                    sum = logit_softcap*tanhf(sum);
+                }
+
                 sum += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
 
                 kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum);
@@ -293,10 +305,10 @@ static __global__ void flash_attn_vec_ext_f32(
     }
 }
 
-template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V>
+template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
 void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     constexpr int nwarps = D/WARP_SIZE;
-    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V>;
+    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
     constexpr bool need_f16_K = D != 128;
     constexpr bool need_f16_V = D != 128 && D != 64;
     launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, need_f16_K, need_f16_V);
@@ -304,44 +316,78 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx,
 
 template <int D, ggml_type type_K, ggml_type type_V>
 void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q   = dst->src[0];
-    ggml_tensor * K   = dst->src[1];
-    ggml_tensor * V   = dst->src[2];
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q   = dst->src[0];
+    const ggml_tensor * K   = dst->src[1];
+    const ggml_tensor * V   = dst->src[2];
 
     GGML_ASSERT(K->type == type_K);
     GGML_ASSERT(V->type == type_V);
 
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
     if (Q->ne[1] == 1) {
         constexpr int cols_per_block  = 1;
         constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     if (Q->ne[1] == 2) {
         constexpr int cols_per_block  = 2;
         constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     if (Q->ne[1] <= 4) {
         constexpr int cols_per_block  = 4;
         constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     if (Q->ne[1] <= 8) {
         constexpr int cols_per_block  = 8;
         constexpr int parallel_blocks = 4;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        } else {
+            constexpr bool use_logit_softcap = true;
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        }
         return;
     }
 
     constexpr int cols_per_block  = 8;
     constexpr int parallel_blocks = 1;
-    ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V>(ctx, dst);
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+    } else {
+        constexpr bool use_logit_softcap = true;
+        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+    }
 }
 
 #define DECL_FATTN_VEC_F32_CASE(D, type_K, type_V)                          \

+ 59 - 6
llama/ggml-cuda/fattn-wmma-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -32,7 +32,7 @@
 #endif // FP16_MMA_AVAILABLE
 
 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
-template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
+template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
 #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
@@ -48,6 +48,7 @@ static __global__ void flash_attn_ext_f16(
         const float m0,
         const float m1,
         const uint32_t n_head_log2,
+        const float logit_softcap,
         const int ne00,
         const int ne01,
         const int ne02,
@@ -72,6 +73,12 @@ static __global__ void flash_attn_ext_f16(
         const int ne2,
         const int ne3) {
 #ifdef FP16_MMA_AVAILABLE
+    // Skip unused kernel variants for faster compilation:
+    if (use_logit_softcap && !(D == 128 || D == 256)) {
+        NO_DEVICE_CODE;
+        return;
+    }
+
     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
 
     const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
@@ -111,6 +118,8 @@ static __global__ void flash_attn_ext_f16(
     const half  slopeh = __float2half(slopef);
     const half2 slope2 = make_half2(slopef, slopef);
 
+    const half2 logit_softcap_2 = make_half2(logit_softcap, logit_softcap);
+
     frag_b Q_b[D/16][ncols/frag_n];
 
     // A single buffer for temporarily holding tiles of KQ and VKQ parts:
@@ -220,6 +229,10 @@ static __global__ void flash_attn_ext_f16(
                     const int k = k0 + threadIdx.x;
 
                     KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k];
+
+                    if (use_logit_softcap) {
+                        KQ_f_tmp[k0/WARP_SIZE] = logit_softcap*tanhf(KQ_f_tmp[k0/WARP_SIZE]);
+                    }
                 }
 
                 float KQ_max_new = KQ_max_f[j0/nwarps];
@@ -263,6 +276,15 @@ static __global__ void flash_attn_ext_f16(
                     const int k = k0 + threadIdx.x;
 
                     KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k];
+
+                    if (use_logit_softcap) {
+                        // There is no dedicated hyperbolic tangent (tanh) function for half2.
+                        KQ2_tmp[k0/WARP_SIZE] = h2exp(KQ2_tmp[k0/WARP_SIZE]*make_half2(2.0f, 2.0f));
+                        KQ2_tmp[k0/WARP_SIZE] = (KQ2_tmp[k0/WARP_SIZE] - make_half2(1.0f, 1.0f))
+                                               /(KQ2_tmp[k0/WARP_SIZE] + make_half2(1.0f, 1.0f));
+
+                        KQ2_tmp[k0/WARP_SIZE] *= logit_softcap_2;
+                    }
                 }
 
                 half2 KQ_max_new = KQ_max_h2[j0/nwarps];
@@ -453,7 +475,8 @@ static_assert(get_VKQ_stride( 80, 4, 16) ==  16, "Test failed.");
 
 template <int D, int cols_per_block, typename KQ_acc_t>
 void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * Q = dst->src[0];
+    const ggml_tensor * KQV = dst;
+    const ggml_tensor * Q   = dst->src[0];
 
     constexpr int nwarps = 4;
 
@@ -461,20 +484,50 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
     const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
     const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
 
+    float logit_softcap;
+    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
+
     if (4*blocks_num_pb1 < 2*nsm) {
         constexpr int parallel_blocks = 4;
-        fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
+        fattn_kernel_t fattn_kernel;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            fattn_kernel = flash_attn_ext_f16<
+                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+        } else {
+            constexpr bool use_logit_softcap = true;
+            fattn_kernel = flash_attn_ext_f16<
+                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+        }
         launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
         return;
     }
     if (2*blocks_num_pb1 < 2*nsm) {
         constexpr int parallel_blocks = 2;
-        fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
+        fattn_kernel_t fattn_kernel;
+        if (logit_softcap == 0.0f) {
+            constexpr bool use_logit_softcap = false;
+            fattn_kernel = flash_attn_ext_f16<
+                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+        } else {
+            constexpr bool use_logit_softcap = true;
+            fattn_kernel = flash_attn_ext_f16<
+                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+        }
         launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
         return;
     }
     constexpr int parallel_blocks = 1;
-    fattn_kernel_t fattn_kernel = flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>;
+    fattn_kernel_t fattn_kernel;
+    if (logit_softcap == 0.0f) {
+        constexpr bool use_logit_softcap = false;
+        fattn_kernel = flash_attn_ext_f16<
+            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+    } else {
+        constexpr bool use_logit_softcap = true;
+        fattn_kernel = flash_attn_ext_f16<
+            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+    }
     launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
 }
 

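The half2 block above works around the lack of a vectorized tanh intrinsic by evaluating, per component,

    \[
    \tanh(x) \;=\; \frac{e^{2x} - 1}{e^{2x} + 1},
    \]

via h2exp before scaling by logit_softcap_2; this matches the scalar tanhf path used in the f32 kernels.
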
+ 3 - 3
llama/ggml-cuda/fattn.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -39,7 +39,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
     const ggml_tensor * KQV = dst;
     const ggml_tensor * Q   = dst->src[0];
 
-    const int32_t precision = KQV->op_params[2];
+    const int32_t precision = KQV->op_params[3];
 
     if (precision != GGML_PREC_DEFAULT) {
         if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
@@ -327,7 +327,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
 
     ggml_cuda_set_device(ctx.device);
     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    const int32_t precision = KQV->op_params[2];
+    const int32_t precision = KQV->op_params[3];
 
     // On AMD the tile kernels perform poorly, use the vec kernel instead:
     if (cc >= CC_OFFSET_AMD) {

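The precision flag moves from op_params[2] to op_params[3] here because logit_softcap now occupies index 2. Inferred from the hunks above (not spelled out in this diff), the flash-attention op_params layout after this sync is:

    // inferred layout:
    //   op_params[0] = scale          (float, memcpy'd)
    //   op_params[1] = max_bias       (float, memcpy'd)
    //   op_params[2] = logit_softcap  (float, memcpy'd)
    //   op_params[3] = precision      (int32, read directly)
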
+ 1 - 1
llama/ggml-cuda/fattn.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/getrows.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/getrows.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/im2col.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/im2col.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mma.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmq.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmq.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmvq.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmvq.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/norm.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/norm.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pad.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pad.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pool2d.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pool2d.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/quantize.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/quantize.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 2 - 2
llama/ggml-cuda/rope.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -252,7 +252,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
     const int32_t * pos = (const int32_t *) src1_d;
 

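The rope change swaps the magic constant 2 for the named flag GGML_ROPE_TYPE_NEOX; assuming that macro is defined as 2 in this sync's ggml.h (it is not shown in these hunks), the NeoX-mode check behaves exactly as before.

    // assumption, not shown in this diff: #define GGML_ROPE_TYPE_NEOX 2   (ggml.h)
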
+ 1 - 1
llama/ggml-cuda/rope.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/scale.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/scale.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/softmax.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/softmax.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 2 - 3
llama/ggml-cuda/sumrows.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -42,7 +42,7 @@ static __global__ void k_sum_rows_f32(const float * x, float * dst, const int nc
     }
 }
 
-static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     const dim3 block_dims(WARP_SIZE, 1, 1);
     const dim3 block_nums(nrows, 1, 1);
     k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
@@ -58,7 +58,6 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-
     const int64_t ncols = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);
 

+ 3 - 1
llama/ggml-cuda/sumrows.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *
@@ -26,4 +26,6 @@
 
 #include "common.cuh"
 
+void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
+
 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  *
  * MIT License
  *

Some files were not shown because too many files changed in this diff