Bump llama sync to 1e6f65

Daniel Hiltgen, 9 months ago · commit 80db43b7b4

100 changed files with 287 additions and 535 deletions
   1. llama/build-info.cpp (+1, -1)
   2. llama/clip.cpp (+1, -1)
   3. llama/clip.h (+1, -1)
   4. llama/common.cpp (+58, -33)
   5. llama/common.h (+22, -5)
   6. llama/ggml-aarch64.c (+15, -15)
   7. llama/ggml-aarch64.h (+1, -1)
   8. llama/ggml-alloc.c (+1, -1)
   9. llama/ggml-alloc.h (+1, -1)
  10. llama/ggml-backend-impl.h (+1, -1)
  11. llama/ggml-backend.c (+1, -1)
  12. llama/ggml-backend.h (+1, -1)
  13. llama/ggml-common.h (+1, -1)
  14. llama/ggml-cuda.cu (+19, -5)
  15. llama/ggml-cuda.h (+1, -1)
  16. llama/ggml-cuda/acc.cu (+1, -1)
  17. llama/ggml-cuda/acc.cuh (+1, -1)
  18. llama/ggml-cuda/alibi.cu (+26, -0)
  19. llama/ggml-cuda/alibi.cuh (+26, -0)
  20. llama/ggml-cuda/arange.cu (+1, -1)
  21. llama/ggml-cuda/arange.cuh (+1, -1)
  22. llama/ggml-cuda/argsort.cu (+1, -1)
  23. llama/ggml-cuda/argsort.cuh (+1, -1)
  24. llama/ggml-cuda/binbcast.cu (+1, -1)
  25. llama/ggml-cuda/binbcast.cuh (+1, -1)
  26. llama/ggml-cuda/clamp.cu (+1, -1)
  27. llama/ggml-cuda/clamp.cuh (+1, -1)
  28. llama/ggml-cuda/common.cuh (+5, -375)
  29. llama/ggml-cuda/concat.cu (+1, -1)
  30. llama/ggml-cuda/concat.cuh (+1, -1)
  31. llama/ggml-cuda/conv-transpose-1d.cu (+1, -1)
  32. llama/ggml-cuda/conv-transpose-1d.cuh (+1, -1)
  33. llama/ggml-cuda/convert.cu (+1, -1)
  34. llama/ggml-cuda/convert.cuh (+1, -1)
  35. llama/ggml-cuda/cpy.cu (+1, -1)
  36. llama/ggml-cuda/cpy.cuh (+1, -1)
  37. llama/ggml-cuda/dequantize.cuh (+1, -1)
  38. llama/ggml-cuda/diagmask.cu (+1, -1)
  39. llama/ggml-cuda/diagmask.cuh (+1, -1)
  40. llama/ggml-cuda/dmmv.cu (+16, -7)
  41. llama/ggml-cuda/dmmv.cuh (+3, -1)
  42. llama/ggml-cuda/fattn-common.cuh (+1, -1)
  43. llama/ggml-cuda/fattn-tile-f16.cu (+1, -1)
  44. llama/ggml-cuda/fattn-tile-f16.cuh (+1, -1)
  45. llama/ggml-cuda/fattn-tile-f32.cu (+1, -1)
  46. llama/ggml-cuda/fattn-tile-f32.cuh (+1, -1)
  47. llama/ggml-cuda/fattn-vec-f16.cuh (+1, -1)
  48. llama/ggml-cuda/fattn-vec-f32.cuh (+1, -1)
  49. llama/ggml-cuda/fattn-wmma-f16.cuh (+1, -1)
  50. llama/ggml-cuda/fattn.cu (+1, -1)
  51. llama/ggml-cuda/fattn.cuh (+1, -1)
  52. llama/ggml-cuda/getrows.cu (+1, -1)
  53. llama/ggml-cuda/getrows.cuh (+1, -1)
  54. llama/ggml-cuda/im2col.cu (+1, -1)
  55. llama/ggml-cuda/im2col.cuh (+1, -1)
  56. llama/ggml-cuda/mma.cuh (+1, -1)
  57. llama/ggml-cuda/mmq.cu (+1, -1)
  58. llama/ggml-cuda/mmq.cuh (+1, -1)
  59. llama/ggml-cuda/mmvq.cu (+1, -1)
  60. llama/ggml-cuda/mmvq.cuh (+1, -1)
  61. llama/ggml-cuda/norm.cu (+7, -4)
  62. llama/ggml-cuda/norm.cuh (+1, -1)
  63. llama/ggml-cuda/pad.cu (+1, -1)
  64. llama/ggml-cuda/pad.cuh (+1, -1)
  65. llama/ggml-cuda/pool2d.cu (+1, -1)
  66. llama/ggml-cuda/pool2d.cuh (+1, -1)
  67. llama/ggml-cuda/quantize.cu (+1, -1)
  68. llama/ggml-cuda/quantize.cuh (+1, -1)
  69. llama/ggml-cuda/rope.cu (+1, -1)
  70. llama/ggml-cuda/rope.cuh (+1, -1)
  71. llama/ggml-cuda/scale.cu (+1, -1)
  72. llama/ggml-cuda/scale.cuh (+1, -1)
  73. llama/ggml-cuda/softmax.cu (+1, -1)
  74. llama/ggml-cuda/softmax.cuh (+1, -1)
  75. llama/ggml-cuda/sumrows.cu (+1, -1)
  76. llama/ggml-cuda/sumrows.cuh (+1, -1)
  77. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu (+1, -1)
  78. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu (+1, -1)
  79. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu (+1, -1)
  80. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu (+1, -1)
  81. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu (+1, -1)
  82. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu (+1, -1)
  83. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu (+1, -1)
  84. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu (+1, -1)
  85. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu (+1, -1)
  86. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu (+1, -1)
  87. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu (+1, -1)
  88. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu (+1, -1)
  89. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu (+1, -1)
  90. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu (+1, -1)
  91. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu (+1, -1)
  92. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu (+1, -1)
  93. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu (+1, -1)
  94. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu (+1, -1)
  95. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu (+1, -1)
  96. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu (+1, -1)
  97. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu (+1, -1)
  98. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu (+1, -1)
  99. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu (+1, -1)
 100. llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu (+1, -1)

+ 1 - 1
llama/build-info.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/clip.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/clip.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 58 - 33
llama/common.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -710,14 +710,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char* lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1660,7 +1670,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "server",      "       --host HOST",            "ip address to listen (default: %s)", params.hostname.c_str() });
     options.push_back({ "server",      "       --port PORT",            "port to listen (default: %d)", params.port });
     options.push_back({ "server",      "       --path PATH",            "path to serve static files from (default: %s)", params.public_path.c_str() });
-    options.push_back({ "server",      "       --embedding(s)",         "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+    options.push_back({ "server",      "       --embedding(s)",         "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
     options.push_back({ "server",      "       --api-key KEY",          "API key to use for authentication (default: none)" });
     options.push_back({ "server",      "       --api-key-file FNAME",   "path to file containing API keys (default: none)" });
     options.push_back({ "server",      "       --ssl-key-file FNAME",   "path to file a PEM-encoded SSL private key" });
@@ -1680,6 +1690,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                                                         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server",      "-sps,  --slot-prompt-similarity SIMILARITY",
                                                                         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server",      "       --lora-init-without-apply",     "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});
 
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -2065,8 +2076,8 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);
 
     llama_model * model = nullptr;
@@ -2081,7 +2092,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2090,7 +2101,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }
 
     if (!params.control_vectors.empty()) {
@@ -2101,7 +2112,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
 
         int err = llama_control_vector_apply(lctx,
@@ -2113,34 +2124,38 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
     }
 
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-
-        // try to load as gguf
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
 
             // if that fails, try loading as ggla for compatibility
             int err = llama_model_apply_lora_from_file(model,
-                                                    lora_adapter.c_str(),
-                                                    lora_scale,
+                                                    la.path.c_str(),
+                                                    la.scale,
                                                     nullptr,
                                                     params.n_threads);
             if (err != 0) {
                 fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
                 llama_free(lctx);
                 llama_free_model(model);
-                return std::make_tuple(nullptr, nullptr);
+                return iparams;
+            } else {
+                break;
             }
-        } else {
-            llama_lora_adapter_set(lctx, adapter, lora_scale);
         }
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2174,7 +2189,18 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         llama_reset_timings(lctx);
     }
 
-    return std::make_tuple(model, lctx);
+    iparams.model   = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
 }
 
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
@@ -3199,19 +3225,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, "  - %s\n", la.path.c_str());
         }
-        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);

+ 22 - 5
llama/common.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -59,6 +59,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -152,8 +161,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -334,8 +343,13 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+    struct llama_model   * model   = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
+};
+
+struct llama_init_result    llama_init_from_gpt_params(gpt_params & params);
 
 struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
@@ -343,6 +357,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
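
One practical consequence of the split declared here: because llama_lora_adapters_apply() skips entries whose scale is 0.0f, an adapter can be disabled at runtime without unloading it. Continuing the sketch from the common.cpp section above:

    // Sketch: turn the first loaded adapter off, then re-apply the rest.
    init.lora_adapters[0].scale = 0.0f;
    llama_lora_adapters_apply(init.context, init.lora_adapters);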

+ 15 - 15
llama/ggml-aarch64.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -410,8 +410,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -522,8 +522,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -640,7 +640,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -706,12 +706,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
@@ -771,8 +771,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1292,8 +1292,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (svcntw() == 8) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (svcntw() == 8)) &&
+    if (ggml_sve_cnt_b == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1754,7 +1754,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (svcntw() == 8) {
+    if (ggml_sve_cnt_b == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2165,12 +2165,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (svcntw() == 8)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (svcntw() == 8)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
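
For reference, the old check svcntw() == 8 and the new check ggml_sve_cnt_b == QK8_0 describe the same hardware: svcntw() counts 32-bit words per SVE register, and QK8_0 is 32 bytes, so both hold exactly for 256-bit SVE vectors. A sketch of the equivalence, assuming ggml_sve_cnt_b is a global cached from svcntb() at startup (the caching site is not part of this diff):

    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>
    // svcntb(): SVE register width in bytes; svcntw(): width in 32-bit words.
    // 256-bit SVE  <=>  svcntb() == 32 (== QK8_0)  <=>  svcntw() == 8
    static inline bool sve_is_256_bit(void) {
        return svcntb() == 32;
    }
    #endif

Caching the value in a global avoids re-querying the vector length inside every GEMV/GEMM call.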

+ 1 - 1
llama/ggml-aarch64.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-alloc.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-alloc.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-backend-impl.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-backend.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-backend.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-common.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 19 - 5
llama/ggml-cuda.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -156,7 +156,22 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
     }
     return res;
 #else
+
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+    cudaError_t err;
+    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
+    {
+        err = cudaMallocManaged(ptr, size);
+    }
+    else
+    {
+        err = cudaMalloc(ptr, size);
+    }
+    return err;
+#else
     return cudaMalloc(ptr, size);
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+
 #endif
 }
 
@@ -1516,7 +1531,7 @@ static void ggml_cuda_op_mul_mat(
         }
 
         // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
-        if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+        if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
             const int64_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
             const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
             CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
@@ -1915,10 +1930,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
 
-    bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
+    bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[0] >= GGML_CUDA_DMMV_X*2
-        && src1->ne[1] == 1;
+        && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
     bool          use_mul_mat_vec_q =  ggml_is_quantized(src0->type)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;

+ 1 - 1
llama/ggml-cuda.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/acc.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/acc.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 26 - 0
llama/ggml-cuda/alibi.cu

@@ -1,3 +1,29 @@
+/**
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 /**
  * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *

+ 26 - 0
llama/ggml-cuda/alibi.cuh

@@ -1,3 +1,29 @@
+/**
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
 /**
  * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
  *

+ 1 - 1
llama/ggml-cuda/arange.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/arange.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argsort.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argsort.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/binbcast.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/binbcast.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/clamp.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/clamp.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 5 - 375
llama/ggml-cuda/common.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -53,255 +53,11 @@
 #include <vector>
 
 #if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F  HIPBLAS_R_16F
-#define CUDA_R_32F  HIPBLAS_R_32F
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-#define cublasCreate hipblasCreate
-#define cublasDestroy hipblasDestroy
-#define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceProp hipDeviceProp_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
-#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEventSynchronize hipEventSynchronize
-#define cudaEvent_t hipEvent_t
-#define cudaEventDestroy hipEventDestroy
-#define cudaFree hipFree
-#define cudaFreeHost hipHostFree
-#define cudaGetDevice hipGetDevice
-#define cudaGetDeviceCount hipGetDeviceCount
-#define cudaGetDeviceProperties hipGetDeviceProperties
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#define cudaHostRegister hipHostRegister
-#define cudaHostRegisterPortable hipHostRegisterPortable
-#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
-#define cudaHostUnregister hipHostUnregister
-#define cudaLaunchHostFunc hipLaunchHostFunc
-#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
-#define cudaMemGetInfo hipMemGetInfo
-#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-#define cudaSetDevice hipSetDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamDestroy hipStreamDestroy
-#define cudaStreamFireAndForget hipStreamFireAndForget
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamPerThread hipStreamPerThread
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define __trap() do { abort(); __builtin_unreachable(); } while(0)
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
-#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
-#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
-#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
-#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
-#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
-#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
-#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+#include "vendors/hip.h"
 #elif defined(GGML_USE_MUSA)
-#include <musa_runtime.h>
-#include <musa.h>
-#include <mublas.h>
-#include <musa_fp16.h>
-// XXX: Keep the following order the same as hipBLAS
-// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
-// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
-#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N MUBLAS_OP_N
-#define CUBLAS_OP_T MUBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
-// #define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F  MUSA_R_16F
-#define CUDA_R_32F  MUSA_R_32F
-// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-// #define cublasComputeType_t mublasComputeType_t
-#define cublasCreate mublasCreate
-#define cublasDestroy mublasDestroy
-#define cublasGemmEx mublasGemmEx
-#define cublasGemmBatchedEx mublasGemmBatchedEx
-#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
-#define cublasHandle_t mublasHandle_t
-// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetMathMode mublasSetMathMode
-#define cublasSetStream mublasSetStream
-#define cublasSgemm mublasSgemm
-#define cublasStatus_t mublasStatus_t
-#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
-#define cudaDeviceProp musaDeviceProp
-#define cudaDeviceSynchronize musaDeviceSynchronize
-#define cudaError_t musaError_t
-#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
-#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
-#define cudaEventCreateWithFlags musaEventCreateWithFlags
-#define cudaEventDisableTiming musaEventDisableTiming
-#define cudaEventRecord musaEventRecord
-#define cudaEventSynchronize musaEventSynchronize
-#define cudaEvent_t musaEvent_t
-#define cudaEventDestroy musaEventDestroy
-#define cudaFree musaFree
-#define cudaFreeHost musaFreeHost
-#define cudaGetDevice musaGetDevice
-#define cudaGetDeviceCount musaGetDeviceCount
-#define cudaGetDeviceProperties musaGetDeviceProperties
-#define cudaGetErrorString musaGetErrorString
-#define cudaGetLastError musaGetLastError
-#define cudaHostRegister musaHostRegister
-#define cudaHostRegisterPortable musaHostRegisterPortable
-#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
-#define cudaHostUnregister musaHostUnregister
-#define cudaLaunchHostFunc musaLaunchHostFunc
-#define cudaMalloc musaMalloc
-#define cudaMallocHost musaMallocHost
-#define cudaMemcpy musaMemcpy
-#define cudaMemcpyAsync musaMemcpyAsync
-#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
-#define cudaMemcpy2DAsync musaMemcpy2DAsync
-#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
-#define cudaMemcpyKind musaMemcpyKind
-#define cudaMemset musaMemset
-#define cudaMemsetAsync musaMemsetAsync
-#define cudaMemGetInfo musaMemGetInfo
-#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
-#define cudaSetDevice musaSetDevice
-#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
-#define cudaStreamDestroy musaStreamDestroy
-#define cudaStreamFireAndForget musaStreamFireAndForget
-#define cudaStreamNonBlocking musaStreamNonBlocking
-#define cudaStreamPerThread musaStreamPerThread
-#define cudaStreamSynchronize musaStreamSynchronize
-#define cudaStreamWaitEvent musaStreamWaitEvent
-#define cudaStream_t musaStream_t
-#define cudaSuccess musaSuccess
-
-// XXX: Other CUDA => MUSA mapping
-#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
-#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
-#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
-#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
-#define CUdevice MUdevice
-#define CUdeviceptr MUdeviceptr
-#define CUmemAccessDesc MUmemAccessDesc
-#define CUmemAllocationProp MUmemAllocationProp
-#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
-#define cuDeviceGet muDeviceGet
-#define cuDeviceGetAttribute muDeviceGetAttribute
-#define cuMemAddressFree muMemAddressFree
-#define cuMemAddressReserve muMemAddressReserve
-#define cuMemCreate muMemCreate
-#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
-#define cuMemMap muMemMap
-#define cuMemRelease muMemRelease
-#define cuMemSetAccess muMemSetAccess
-#define cuMemUnmap muMemUnmap
-#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
-#define cudaFuncSetAttribute musaFuncSetAttribute
-#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
-#define make_cudaExtent make_musaExtent
-#define make_cudaPitchedPtr make_musaPitchedPtr
-
-// XXX: USE_CUDA_GRAPH
-#define CUDA_SUCCESS MUSA_SUCCESS
-#define CUresult MUresult
-#define cuGetErrorString muGetErrorString
-#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
-#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
-#define cudaGraphDestroy musaGraphDestroy
-#define cudaGraphExecDestroy musaGraphExecDestroy
-#define cudaGraphExec_t musaGraphExec_t
-#define cudaGraphExecUpdate musaGraphExecUpdate
-#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
-#define cudaGraphGetNodes musaGraphGetNodes
-#define cudaGraphInstantiate musaGraphInstantiate
-#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
-#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
-#define cudaGraphLaunch musaGraphLaunch
-#define cudaGraphNodeGetType musaGraphNodeGetType
-#define cudaGraphNode_t musaGraphNode_t
-#define cudaGraphNodeType musaGraphNodeType
-#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
-#define cudaGraph_t musaGraph_t
-#define cudaKernelNodeParams musaKernelNodeParams
-#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
-#define cudaStreamEndCapture musaStreamEndCapture
-
-// XXX: cuBLAS => muBLAS mapping
-#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-
-// XXX: Clang builtins mapping
-#define __vsub4   __vsub4_musa
-#define __vcmpeq4 __vcmpeq4_musa
-#define __vcmpne4 __vcmpne4_musa
+#include "vendors/musa.h"
 #else
-#include <cuda_runtime.h>
-#include <cuda.h>
-#include <cublas_v2.h>
-#include <cuda_fp16.h>
-
-#if CUDART_VERSION < 11020
-#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
-#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-#endif // CUDART_VERSION < 11020
-
+#include "vendors/cuda.h"
 #endif // defined(GGML_USE_HIPBLAS)
 
 #define STRINGIZE_IMPL(...) #__VA_ARGS__
@@ -344,11 +100,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
 
 #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
     static const char * cublas_get_error_str(const cublasStatus_t err) {
-#ifndef GGML_USE_MUSA
         return cublasGetStatusString(err);
-#else
-        return mublasStatus_to_string(err);
-#endif // GGML_USE_MUSA
     }
 #else
     static const char * cublas_get_error_str(const cublasStatus_t err) {
@@ -390,129 +142,7 @@ typedef half2 dfloat2;
 #else
 typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
-#endif //GGML_CUDA_F16
-
-#if defined(GGML_USE_MUSA)
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-
-static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
-    return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
-    }
-    return c;
-}
-#endif // defined(GGML_USE_MUSA)
-
-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#if defined(__gfx1010__) || defined(__gfx1012__)
-#define RDNA1
-#endif
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int &>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int &>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __vsub4(const int a, const int b) {
-    return __vsubss4(a, b);
-}
-
-static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
-    }
-    return c;
-}
-
-static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
-    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
-    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
-    unsigned int c;
-    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
-#pragma unroll
-    for (int i = 0; i < 4; ++i) {
-        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
-    }
-    return c;
-}
-
-#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-// __shfl_xor() for half2 was added in ROCm 5.6
-static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
-    typedef union half2_b32 {
-        half2 val;
-        int   b32;
-    } half2_b32_t;
-    half2_b32_t tmp;
-    tmp.val = var;
-    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
-    return tmp.val;
-}
-#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
-#endif // defined(GGML_USE_HIPBLAS)
+#endif // GGML_CUDA_F16
 
 #if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
 #define FP16_AVAILABLE
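
The large deletion above is the CUDA-to-HIP and CUDA-to-MUSA alias tables (plus the HIP/MUSA device shims) moving out of common.cuh into per-vendor headers; the unconditional cublasGetStatusString() call a few hunks down likewise assumes vendors/musa.h now provides that alias. A sketch of the resulting selection, on the assumption that the vendors/*.h files carry exactly the tables removed here:

    #if defined(GGML_USE_HIPBLAS)
    #include "vendors/hip.h"    // cuda*/cublas* -> hip*/hipblas* aliases + shims
    #elif defined(GGML_USE_MUSA)
    #include "vendors/musa.h"   // cuda*/cublas* -> musa*/mublas* aliases + shims
    #else
    #include "vendors/cuda.h"   // real CUDA headers + pre-11.2 compat defines
    #endif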

+ 1 - 1
llama/ggml-cuda/concat.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/concat.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/conv-transpose-1d.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/conv-transpose-1d.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/convert.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/convert.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cpy.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cpy.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/dequantize.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/diagmask.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/diagmask.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 16 - 7
llama/ggml-cuda/dmmv.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -526,7 +526,7 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
 }
 
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
     const dim3 block_nums(block_num_y, 1, 1);
@@ -536,7 +536,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 }
 
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -545,7 +545,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 }
 
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -554,7 +554,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 }
 
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -563,7 +563,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 }
 
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -614,7 +614,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 }
 
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    GGML_ASSERT(ncols % (GGML_CUDA_DMMV_X*2) == 0);
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(block_num_y, 1, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
@@ -698,3 +698,12 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
     GGML_UNUSED(src1_ncols);
     GGML_UNUSED(src1_padded_row_size);
 }
+
+bool ggml_cuda_dmmv_type_supported(ggml_type src0_type) {
+    return src0_type == GGML_TYPE_Q4_0 || src0_type == GGML_TYPE_Q4_1 ||
+        src0_type == GGML_TYPE_Q5_0 || src0_type == GGML_TYPE_Q5_1 ||
+        src0_type == GGML_TYPE_Q8_0 || src0_type == GGML_TYPE_Q2_K ||
+        src0_type == GGML_TYPE_Q3_K || src0_type == GGML_TYPE_Q4_K ||
+        src0_type == GGML_TYPE_Q5_K || src0_type == GGML_TYPE_Q6_K ||
+        src0_type == GGML_TYPE_F16;
+}
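
Note: the tightened asserts above track a kernel change in which the updated DMMV path dequantizes two values per thread per iteration, so the column count must now be a multiple of GGML_CUDA_DMMV_X*2 rather than GGML_CUDA_DMMV_X. A minimal sketch of the implied constraint, assuming the default GGML_CUDA_DMMV_X of 32 from ggml-cuda/common.cuh (dmmv_ncols_ok is illustrative, not a symbol from the diff):

    // Hedged sketch, not part of the diff: with GGML_CUDA_DMMV_X == 32
    // (the ggml default), ncols must now be a multiple of 64.
    static inline bool dmmv_ncols_ok(int64_t ncols) {
        // e.g. 4096 passes; 4064 passed the old ncols % 32 check but fails now
        return ncols % (GGML_CUDA_DMMV_X * 2) == 0;
    }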

+ 3 - 1
llama/ggml-cuda/dmmv.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -42,3 +42,5 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream);
+
+bool ggml_cuda_dmmv_type_supported(ggml_type src0_type);
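
The new ggml_cuda_dmmv_type_supported() export lets the dispatcher in ggml-cuda.cu (also changed in this commit, per the file list) gate the DMMV path per source-tensor type instead of hardcoding the type list at the call site. A hedged sketch of how a caller might combine it with the column constraint above; can_use_dmmv is an illustrative helper, not a symbol from the diff:

    // Illustrative only: combine the exported type check with the
    // GGML_CUDA_DMMV_X*2 divisibility requirement from dmmv.cu.
    static bool can_use_dmmv(const ggml_tensor * src0) {
        return ggml_cuda_dmmv_type_supported(src0->type) &&
               src0->ne[0] % (GGML_CUDA_DMMV_X * 2) == 0;
    }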

+ 1 - 1
llama/ggml-cuda/fattn-common.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f32.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f32.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-vec-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-vec-f32.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-wmma-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/getrows.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/getrows.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/im2col.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/im2col.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mma.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmq.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmq.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmvq.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmvq.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 7 - 4
llama/ggml-cuda/norm.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *
@@ -168,8 +168,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     }
 }
 
-static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) {
-    static const float eps = 1e-6f;
+static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
     if (group_size < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
         group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
@@ -222,8 +221,12 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
     int num_groups = dst->op_params[0];
+
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream);
+    group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream);
 }
 
 void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
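
With this change the group-norm epsilon is no longer hardcoded to 1e-6f in the CUDA backend; it is read from the tensor's op_params next to the group count. A minimal sketch of the producer side this unpacking assumes (op_params[0] holds num_groups, the float eps bits follow); set_group_norm_params is illustrative, not a function from the diff:

    // Hedged sketch: pack eps after num_groups in the int32 op_params
    // array, mirroring the memcpy(&eps, dst->op_params + 1, sizeof(float))
    // unpacking in ggml_cuda_op_group_norm above.
    static void set_group_norm_params(ggml_tensor * dst, int num_groups, float eps) {
        dst->op_params[0] = num_groups;
        memcpy(dst->op_params + 1, &eps, sizeof(float));
    }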

+ 1 - 1
llama/ggml-cuda/norm.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pad.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pad.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pool2d.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pool2d.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/quantize.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/quantize.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/rope.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/rope.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/scale.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/scale.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/softmax.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/softmax.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/sumrows.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/sumrows.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
  *
  * MIT License
  *

Some files were not shown because too many files changed in this diff