Browse Source

llama: update vendored code to commit 46e3556 (#8308)

Jeffrey Morgan 3 months ago
parent
commit
1deafd8254
100 changed files with 447 additions and 327 deletions
  1. 0 2
      api/types.go
  2. 1 1
      llama/amx.cpp
  3. 1 1
      llama/amx.h
  4. 14 14
      llama/clip.cpp
  5. 1 1
      llama/clip.h
  6. 59 23
      llama/common.cpp
  7. 42 20
      llama/common.h
  8. 1 2
      llama/ggml-alloc.c
  9. 1 1
      llama/ggml-alloc.h
  10. 1 1
      llama/ggml-backend-impl.h
  11. 78 53
      llama/ggml-backend-reg.cpp
  12. 5 2
      llama/ggml-backend.cpp
  13. 1 1
      llama/ggml-backend.h
  14. 1 1
      llama/ggml-blas.cpp
  15. 1 1
      llama/ggml-blas.h
  16. 1 1
      llama/ggml-common.h
  17. 1 1
      llama/ggml-cpp.h
  18. 56 73
      llama/ggml-cpu-aarch64.cpp
  19. 1 1
      llama/ggml-cpu-aarch64.h
  20. 1 1
      llama/ggml-cpu-impl.h
  21. 6 2
      llama/ggml-cpu-quants.c
  22. 1 1
      llama/ggml-cpu-quants.h
  23. 1 1
      llama/ggml-cpu-traits.cpp
  24. 1 1
      llama/ggml-cpu-traits.h
  25. 7 7
      llama/ggml-cpu.c
  26. 10 1
      llama/ggml-cpu.cpp
  27. 1 1
      llama/ggml-cpu.h
  28. 1 1
      llama/ggml-cuda.h
  29. 1 1
      llama/ggml-cuda/acc.cu
  30. 1 1
      llama/ggml-cuda/acc.cuh
  31. 1 1
      llama/ggml-cuda/arange.cu
  32. 1 1
      llama/ggml-cuda/arange.cuh
  33. 1 1
      llama/ggml-cuda/argmax.cu
  34. 1 1
      llama/ggml-cuda/argmax.cuh
  35. 1 1
      llama/ggml-cuda/argsort.cu
  36. 1 1
      llama/ggml-cuda/argsort.cuh
  37. 1 1
      llama/ggml-cuda/binbcast.cu
  38. 1 1
      llama/ggml-cuda/binbcast.cuh
  39. 1 1
      llama/ggml-cuda/clamp.cu
  40. 1 1
      llama/ggml-cuda/clamp.cuh
  41. 1 1
      llama/ggml-cuda/common.cuh
  42. 1 1
      llama/ggml-cuda/concat.cu
  43. 1 1
      llama/ggml-cuda/concat.cuh
  44. 1 1
      llama/ggml-cuda/conv-transpose-1d.cu
  45. 1 1
      llama/ggml-cuda/conv-transpose-1d.cuh
  46. 3 1
      llama/ggml-cuda/convert.cu
  47. 1 1
      llama/ggml-cuda/convert.cuh
  48. 1 1
      llama/ggml-cuda/count-equal.cu
  49. 1 1
      llama/ggml-cuda/count-equal.cuh
  50. 1 1
      llama/ggml-cuda/cpy.cu
  51. 1 1
      llama/ggml-cuda/cpy.cuh
  52. 1 1
      llama/ggml-cuda/cross-entropy-loss.cu
  53. 1 1
      llama/ggml-cuda/cross-entropy-loss.cuh
  54. 1 1
      llama/ggml-cuda/dequantize.cuh
  55. 1 1
      llama/ggml-cuda/diagmask.cu
  56. 1 1
      llama/ggml-cuda/diagmask.cuh
  57. 1 1
      llama/ggml-cuda/fattn-common.cuh
  58. 1 1
      llama/ggml-cuda/fattn-tile-f16.cu
  59. 1 1
      llama/ggml-cuda/fattn-tile-f16.cuh
  60. 1 1
      llama/ggml-cuda/fattn-tile-f32.cu
  61. 1 1
      llama/ggml-cuda/fattn-tile-f32.cuh
  62. 1 1
      llama/ggml-cuda/fattn-vec-f16.cuh
  63. 1 1
      llama/ggml-cuda/fattn-vec-f32.cuh
  64. 1 1
      llama/ggml-cuda/fattn-wmma-f16.cuh
  65. 1 1
      llama/ggml-cuda/fattn.cu
  66. 1 1
      llama/ggml-cuda/fattn.cuh
  67. 1 1
      llama/ggml-cuda/getrows.cu
  68. 1 1
      llama/ggml-cuda/getrows.cuh
  69. 3 2
      llama/ggml-cuda/ggml-cuda.cu
  70. 1 1
      llama/ggml-cuda/im2col.cu
  71. 1 1
      llama/ggml-cuda/im2col.cuh
  72. 1 1
      llama/ggml-cuda/mma.cuh
  73. 1 1
      llama/ggml-cuda/mmq.cu
  74. 1 1
      llama/ggml-cuda/mmq.cuh
  75. 77 39
      llama/ggml-cuda/mmv.cu
  76. 1 1
      llama/ggml-cuda/mmv.cuh
  77. 1 1
      llama/ggml-cuda/mmvq.cu
  78. 1 1
      llama/ggml-cuda/mmvq.cuh
  79. 1 1
      llama/ggml-cuda/norm.cu
  80. 1 1
      llama/ggml-cuda/norm.cuh
  81. 1 1
      llama/ggml-cuda/opt-step-adamw.cu
  82. 1 1
      llama/ggml-cuda/opt-step-adamw.cuh
  83. 1 1
      llama/ggml-cuda/out-prod.cu
  84. 1 1
      llama/ggml-cuda/out-prod.cuh
  85. 1 1
      llama/ggml-cuda/pad.cu
  86. 1 1
      llama/ggml-cuda/pad.cuh
  87. 1 1
      llama/ggml-cuda/pool2d.cu
  88. 1 1
      llama/ggml-cuda/pool2d.cuh
  89. 1 1
      llama/ggml-cuda/quantize.cu
  90. 1 1
      llama/ggml-cuda/quantize.cuh
  91. 1 1
      llama/ggml-cuda/rope.cu
  92. 1 1
      llama/ggml-cuda/rope.cuh
  93. 1 1
      llama/ggml-cuda/scale.cu
  94. 1 1
      llama/ggml-cuda/scale.cuh
  95. 1 1
      llama/ggml-cuda/softmax.cu
  96. 1 1
      llama/ggml-cuda/softmax.cuh
  97. 1 1
      llama/ggml-cuda/sum.cu
  98. 1 1
      llama/ggml-cuda/sum.cuh
  99. 1 1
      llama/ggml-cuda/sumrows.cu
  100. 1 1
      llama/ggml-cuda/sumrows.cuh

+ 0 - 2
api/types.go

@@ -225,7 +225,6 @@ type Options struct {
 	Mirostat         int      `json:"mirostat,omitempty"`
 	MirostatTau      float32  `json:"mirostat_tau,omitempty"`
 	MirostatEta      float32  `json:"mirostat_eta,omitempty"`
-	PenalizeNewline  bool     `json:"penalize_newline,omitempty"`
 	Stop             []string `json:"stop,omitempty"`
 }
 
@@ -606,7 +605,6 @@ func DefaultOptions() Options {
 		Mirostat:         0,
 		MirostatTau:      5.0,
 		MirostatEta:      0.1,
-		PenalizeNewline:  true,
 		Seed:             -1,
 
 		Runner: Runner{

+ 1 - 1
llama/amx.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/amx.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 14 - 14
llama/clip.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -935,7 +935,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
                 mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                 // stride = 1, padding = 1, bias is nullptr
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
 
                 // layer norm
                 // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -983,7 +983,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             // block_2
             {
                 // stride = 2
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
 
                 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                 // layer norm
@@ -1044,7 +1044,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             // mlp_2 ne [24, 24, 2048, 1]
             mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
             // weight ne = [3, 3, 2048, 1]
-            struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+            struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
             peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
             peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
             mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
@@ -1262,28 +1262,28 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 
 #ifdef GGML_USE_CUDA
-    new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+   new_clip->backend = ggml_backend_cuda_init(0);
+   LOG_INF("%s: CLIP using CUDA backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_METAL
-    new_clip->backend = ggml_backend_metal_init();
-    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+   new_clip->backend = ggml_backend_metal_init();
+   LOG_INF("%s: CLIP using Metal backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_CANN
-    new_clip->backend = ggml_backend_cann_init(0);
-    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+   new_clip->backend = ggml_backend_cann_init(0);
+   LOG_INF("%s: CLIP using CANN backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_VULKAN
-    new_clip->backend = ggml_backend_vk_init(0);
-    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+   new_clip->backend = ggml_backend_vk_init(0);
+   LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_SYCL
-    new_clip->backend = ggml_backend_sycl_init(0);
-    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+   new_clip->backend = ggml_backend_sycl_init(0);
+   LOG_INF("%s: CLIP using SYCL backend\n", __func__);
 #endif
 
     if (!new_clip->backend) {
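
Note: apart from the vendored-commit header and the indentation-only changes, the functional change in clip.cpp is the rename of ggml_conv_depthwise_2d to ggml_conv_2d_dw. A minimal sketch of a call after the rename, assuming the nine-argument order (stride, padding, dilation per axis) is unchanged from the calls in this diff; conv_w and inp are hypothetical tensors:

    // ctx0, conv_w, inp: assumed to already exist, as in the calls above
    struct ggml_tensor * out = ggml_conv_2d_dw(ctx0, conv_w, inp,
                                               1, 1,   // stride   s0, s1
                                               1, 1,   // padding  p0, p1
                                               1, 1);  // dilation d0, d1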

+ 1 - 1
llama/clip.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 59 - 23
llama/common.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -44,6 +44,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -88,7 +89,9 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#define PATH_MAX MAX_PATH
+#   if !defined(PATH_MAX)
+#   define PATH_MAX MAX_PATH
+#   endif
 #else
 #include <sys/syslimits.h>
 #endif
@@ -912,9 +915,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
-        llama_free_model(model);
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
@@ -945,20 +947,21 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_lora_adapter_ptr lora;
+        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
+
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, params.lora_adapters);
     }
 
     if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -966,6 +969,25 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+            if (llama_token_is_eog(model, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
@@ -1000,17 +1022,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model   = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
     llama_lora_adapter_clear(ctx);
-    for (auto & la : lora_adapters) {
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_lora_adapter_set(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1102,7 +1124,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
 
     while (remaining_attempts > 0) {
@@ -1126,7 +1148,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 }
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
@@ -1156,8 +1177,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif
 
     // Check if the file already exists locally
-    struct stat model_file_info;
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);
 
     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
@@ -1199,11 +1219,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
         std::string etag;
         std::string last_modified;
     };
+
     common_load_model_from_url_headers headers;
+
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1618,6 +1640,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    static const char * template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+    if (res > 0) {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+    return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
@@ -1787,7 +1821,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
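
Note: a minimal usage sketch of the common_get_builtin_chat_template helper added above, assuming a loaded llama_model * named model (error handling abbreviated):

    const std::string tmpl = common_get_builtin_chat_template(model);
    if (tmpl.empty()) {
        LOG_WRN("model has no built-in chat template, a fallback template is needed\n");
    } else if (!common_chat_verify_template(tmpl)) {
        LOG_ERR("built-in chat template is not supported by llama_chat_apply_template\n");
    }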

+ 42 - 20
llama/common.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -28,7 +28,7 @@
 
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
 #include <string>
 #include <vector>
@@ -53,10 +53,8 @@
 struct common_lora_adapter_info {
     std::string path;
     float scale;
-};
 
-struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    struct llama_lora_adapter * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -106,6 +104,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LLAVA,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -121,6 +120,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -156,7 +156,6 @@ struct common_params_sampling {
     int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau       = 5.00f; // target entropy
     float   mirostat_eta       = 0.10f; // learning rate
-    bool    penalize_nl        = false; // consider newlines as a repeatable token
     bool    ignore_eos         = false;
     bool    no_perf            = false; // disable performance metrics
     bool    timing_per_token   = false;
@@ -165,6 +164,7 @@ struct common_params_sampling {
 
 
     std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -184,6 +184,7 @@ struct common_params_sampling {
 
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
     int32_t n_ctx        =     0; // draft context size
     int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
@@ -197,6 +198,14 @@ struct common_params_speculative {
     std::string model = ""; // draft model for speculative decoding                          // NOLINT
 };
 
+struct common_params_vocoder {
+    std::string hf_repo = ""; // HF repo                                                     // NOLINT
+    std::string hf_file = ""; // HF file                                                     // NOLINT
+
+    std::string model     = ""; // model path                                                // NOLINT
+    std::string model_url = ""; // model url to download                                     // NOLINT
+};
+
 struct common_params {
     int32_t n_predict             =    -1; // new tokens to predict
     int32_t n_ctx                 =  4096; // context size
@@ -219,11 +228,13 @@ struct common_params {
     float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
 
     // offload params
-    std::vector<ggml_backend_dev_t> devices;         // devices to use for offloading
-    int32_t n_gpu_layers                    =    -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu                        =     0; // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]               =   {0}; // how split tensors should be distributed across GPUs
-    enum llama_split_mode        split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -237,8 +248,9 @@ struct common_params {
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_params_sampling sampling;
+    struct common_params_sampling    sampling;
     struct common_params_speculative speculative;
+    struct common_params_vocoder     vocoder;
 
     std::string model                = ""; // model path                                                    // NOLINT
     std::string model_alias          = ""; // model alias                                                   // NOLINT
@@ -490,10 +502,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
+// note: defines object's lifetime
 struct common_init_result {
-    struct llama_model   * model   = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<common_lora_adapter_container> lora_adapters;
+    llama_model_ptr   model;
+    llama_context_ptr context;
+
+    std::vector<llama_lora_adapter_ptr> lora;
 };
 
 struct common_init_result     common_init_from_params(common_params & params);
@@ -515,7 +529,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
 
 //
 // Batch utils
@@ -583,6 +597,9 @@ struct common_chat_msg {
     std::string content;
 };
 
+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);
 
@@ -619,7 +636,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //
 
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
@@ -648,6 +666,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
 
-static const char * const LLM_KV_SPLIT_NO            = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO            = "split.no";
+const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
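
Note: with this change common_init_result owns the model, context, and LoRA adapters through the smart-pointer aliases from llama-cpp.h, so callers no longer free them manually. A minimal consumer sketch, assuming llama_model_ptr and llama_context_ptr are the std::unique_ptr aliases declared there:

    common_init_result init = common_init_from_params(params);

    llama_model   * model = init.model.get();   // non-owning views
    llama_context * lctx  = init.context.get();
    if (model == nullptr || lctx == nullptr) {
        return 1;  // whatever was created is released by the smart pointers
    }

    // ... use model / lctx; everything is freed when `init` goes out of scope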

+ 1 - 2
llama/ggml-alloc.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -560,7 +560,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
         hn->buffer_id = buffer_id;
         hn->offset = offset;
-        return;
     }
 }
 

+ 1 - 1
llama/ggml-alloc.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-backend-impl.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 78 - 53
llama/ggml-backend-reg.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -92,6 +92,26 @@
 #include "ggml-kompute.h"
 #endif
 
+// disable C++17 deprecation warning for std::codecvt_utf8
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+static std::wstring utf8_to_utf16(const std::string & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.from_bytes(str);
+}
+
+static std::string utf16_to_utf8(const std::wstring & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.to_bytes(str);
+}
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
 #ifdef _WIN32
 
 using dl_handle = std::remove_pointer_t<HMODULE>;
@@ -114,11 +134,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
     return handle;
 }
 
-static dl_handle * dl_load_library(const std::string & path) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return dl_load_library(converter.from_bytes(path));
-}
-
 static void * dl_get_sym(dl_handle * handle, const char * name) {
     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -140,8 +155,8 @@ struct dl_handle_deleter {
     }
 };
 
-static void * dl_load_library(const std::string & path) {
-    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::wstring & path) {
+    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
 
     return handle;
 }
@@ -182,9 +197,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_CANN
         register_backend(ggml_backend_cann_reg());
 #endif
-#ifdef GGML_USE_BLAS
-        register_backend(ggml_backend_blas_reg());
-#endif
+// #ifdef GGML_USE_BLAS
+//         register_backend(ggml_backend_blas_reg());
+// #endif
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
@@ -228,11 +243,11 @@ struct ggml_backend_registry {
         devices.push_back(device);
     }
 
-    ggml_backend_reg_t load_backend(const char * path, bool silent) {
+    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -240,7 +255,7 @@ struct ggml_backend_registry {
         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
         if (score_fn && score_fn() == 0) {
             if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -248,7 +263,7 @@ struct ggml_backend_registry {
         auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
         if (!backend_init_fn) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -257,16 +272,16 @@ struct ggml_backend_registry {
         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
                 } else {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
                 }
             }
             return nullptr;
         }
 
-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
 
         register_backend(reg, std::move(handle));
 
@@ -402,14 +417,14 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // Dynamic loading
 ggml_backend_reg_t ggml_backend_load(const char * path) {
-    return get_reg().load_backend(path, false);
+    return get_reg().load_backend(utf8_to_utf16(path), false);
 }
 
 void ggml_backend_unload(ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
 
-static std::string get_executable_path() {
+static std::wstring get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -427,13 +442,17 @@ static std::string get_executable_path() {
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "/";
-#elif defined(__linux__)
+    return utf8_to_utf16(base_path + "/");
+#elif defined(__linux__) || defined(__FreeBSD__)
     std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
+#    if defined(__linux__)
         ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+#    elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+#    endif
         if (len == -1) {
             break;
         }
@@ -449,57 +468,63 @@ static std::string get_executable_path() {
         path.resize(path.size() * 2);
     }
 
-    return base_path + "/";
+    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
-    std::vector<char> path(MAX_PATH);
-    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    std::vector<wchar_t> path(MAX_PATH);
+    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
     if (len == 0) {
-        return "";
+        return {};
     }
-    std::string base_path(path.data(), len);
+    std::wstring base_path(path.data(), len);
     // remove executable name
     auto last_slash = base_path.find_last_of('\\');
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "\\";
+    return base_path + L"\\";
+#else
+    return {};
+#endif
+}
+
+static std::wstring backend_filename_prefix() {
+#ifdef _WIN32
+    return L"ggml-";
+#else
+    return L"libggml-";
 #endif
 }
 
-static std::string backend_filename_prefix() {
+static std::wstring backend_filename_suffix() {
 #ifdef _WIN32
-    return "ggml-";
+    return L".dll";
 #else
-    return "libggml-";
+    return L".so";
 #endif
 }
 
-static std::string backend_filename_suffix() {
+static std::wstring path_separator() {
 #ifdef _WIN32
-    return ".dll";
+    return L"\\";
 #else
-    return ".so";
+    return L"/";
 #endif
 }
 
 static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
      // TODO: search system paths
-    std::string file_prefix = backend_filename_prefix() + name + "-";
-    std::vector<std::string> search_paths;
+    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+    std::vector<std::wstring> search_paths;
     if (user_search_path == nullptr) {
-        search_paths.push_back("./");
+        search_paths.push_back(L"." + path_separator());
         search_paths.push_back(get_executable_path());
     } else {
-#if defined(_WIN32)
-        search_paths.push_back(std::string(user_search_path) + "\\");
-#else
-        search_paths.push_back(std::string(user_search_path) + "/");
-#endif
+        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
     }
 
     int best_score = 0;
-    std::string best_path;
+    std::wstring best_path;
 
     namespace fs = std::filesystem;
     for (const auto & search_path : search_paths) {
@@ -509,27 +534,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
         fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
         for (const auto & entry : dir_it) {
             if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                std::string ext = entry.path().extension().string();
+                std::wstring filename = entry.path().filename().wstring();
+                std::wstring ext = entry.path().extension().wstring();
                 if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
                     if (!handle && !silent) {
-                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                     }
                     if (handle) {
                         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
                         if (score_fn) {
                             int s = score_fn();
 #ifndef NDEBUG
-                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
 #endif
                             if (s > best_score) {
                                 best_score = s;
-                                best_path = entry.path().string();
+                                best_path = entry.path().wstring();
                             }
                         } else {
                             if (!silent) {
-                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                             }
                         }
                     }
@@ -541,15 +566,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
     if (best_score == 0) {
         // try to load the base backend
         for (const auto & search_path : search_paths) {
-            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
+            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
             if (fs::exists(path)) {
-                return get_reg().load_backend(path.c_str(), silent);
+                return get_reg().load_backend(path, silent);
             }
         }
         return nullptr;
     }
 
-    return get_reg().load_backend(best_path.c_str(), silent);
+    return get_reg().load_backend(best_path, silent);
 }
 
 void ggml_backend_load_all() {
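
Note: path handling in this file is now std::wstring end to end, with utf8_to_utf16()/utf16_to_utf8() at the boundaries, while the public entry points keep their UTF-8 const char * signatures. A minimal caller sketch (the library filename is a hypothetical example):

    // UTF-8 path in; converted to std::wstring internally via utf8_to_utf16()
    ggml_backend_reg_t reg = ggml_backend_load("./libggml-cuda.so");
    if (reg == nullptr) {
        // load failed or the backend is not supported on this system
    }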

+ 5 - 2
llama/ggml-backend.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -826,9 +826,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                if (j == 0) {
+                    GGML_LOG_DEBUG(": ");
+                }
                 GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }

+ 1 - 1
llama/ggml-backend.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-blas.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-blas.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-common.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cpp.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 56 - 73
llama/ggml-cpu-aarch64.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -220,9 +220,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
 }
 
 static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
     const __m256i zero = _mm256_setzero_si256();
     return _mm256_dpbusd_epi32(zero, ax, sy);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbusd_avx_epi32(zero, ax, sy);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -590,21 +593,21 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
         for (int c = 0; c < nc; c += ncols_interleaved) {
-            const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
             float32x4_t acc = vdupq_n_f32(0);
             for (int b = 0; b < nb; b++) {
-                int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
-                int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
-                int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
-                int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
-                float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
 
                 int8x16_t a0 = vld1q_s8(a_ptr->qs);
                 int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
-                float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
 
                 int32x4_t ret = vdupq_n_s32(0);
 
@@ -673,72 +676,52 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
-        __asm__ __volatile__(
-            "movi v2.16b, #0x4\n"
-            "movi v1.16b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x8\n"
-            "1:"  // Column loop
-            "add x23, %x[a_ptr], #0x2\n"
-            "movi v0.16b, #0x0\n"
-            "mov x22, %x[nb]\n"
-            "2:"  // Block loop
-            "ldr q31, [%x[b_ptr], #0x0]\n"
-            "ldr q30, [%x[b_ptr], #0x10]\n"
-            "mov x21, x23\n"
-            "movi v29.4s, #0x0\n"
-            "ldr q28, [%x[b_ptr], #0x20]\n"
-            "ldr q27, [%x[b_ptr], #0x30]\n"
-            "movi v26.4s, #0x0\n"
-            "sub x20, x23, #0x2\n"
-            "ld1r { v25.8h }, [x20]\n"
-            "ldr q24, [%x[b_ptr], #-0x8]\n"
-            "sub x22, x22, #0x1\n"
-            "add x23, x23, #0x22\n"
-            "ld1r { v23.2d }, [x21], #0x8\n"
-            "sshl v22.16b, v31.16b, v2.16b\n"
-            "sshl v16.16b, v30.16b, v2.16b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x48\n"
-            "ld1r { v21.2d }, [x21], #0x8\n"
-            "sshl v20.16b, v28.16b, v2.16b\n"
-            "sshl v19.16b, v27.16b, v2.16b\n"
-            "ld1r { v18.2d }, [x21], #0x8\n"
-            "ld1r { v17.2d }, [x21], #0x8\n"
-            "and v31.16b, v31.16b, v1.16b\n"
-            "and v30.16b, v30.16b, v1.16b\n"
-            ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
-            ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
-            "and v28.16b, v28.16b, v1.16b\n"
-            "and v27.16b, v27.16b, v1.16b\n"
-            "fcvtl v25.4s, v25.4h\n"
-            "fcvtl v16.4s, v24.4h\n"
-            ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
-            ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
-            "fmul v16.4s, v16.4s, v25.4s\n"
-            ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
-            ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
-            ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
-            ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
-            "addp v29.4s, v29.4s, v26.4s\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "fmla v0.4s, v29.4s, v16.4s\n"
-            "cbnz x22, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q0, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-        );
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+                int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
+                int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
+                int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
+                int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+                int32x4_t ret0 = vdupq_n_s32(0);
+                int32x4_t ret1 = vdupq_n_s32(0);
+
+                ret0 = vdotq_s32(ret0, b0 << 4, a0);
+                ret1 = vdotq_s32(ret1, b1 << 4, a0);
+                ret0 = vdotq_s32(ret0, b2 << 4, a1);
+                ret1 = vdotq_s32(ret1, b3 << 4, a1);
+
+                ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
+                ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
+                ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
+                ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
+
+                int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                        vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
         return;
     }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     float sumf[4];
     int sumi;
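
Note: the hand-written AArch64 assembly kernel above is replaced by NEON intrinsics guarded by __ARM_FEATURE_DOTPROD. For readers unfamiliar with that extension, a minimal sketch of what vdotq_s32 computes, with va and vb as hypothetical int8x16_t inputs:

    // per 32-bit lane i (i = 0..3):
    //   acc[i] += va[4*i+0]*vb[4*i+0] + va[4*i+1]*vb[4*i+1]
    //           + va[4*i+2]*vb[4*i+2] + va[4*i+3]*vb[4*i+3]
    int32x4_t acc = vdupq_n_s32(0);
    acc = vdotq_s32(acc, va, vb);  // 16 signed 8-bit products, accumulated 4 per lane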
 

+ 1 - 1
llama/ggml-cpu-aarch64.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cpu-impl.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 6 - 2
llama/ggml-cpu-quants.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -129,10 +129,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }
 
 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);

+ 1 - 1
llama/ggml-cpu-quants.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cpu-traits.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cpu-traits.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 7 - 7
llama/ggml-cpu.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -1012,7 +1012,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR  4
 
-static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -1023,7 +1023,7 @@ static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
     return _mm_loadu_ps(tmp);
 }
 
-static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     _mm_storeu_ps(arr, y);
@@ -7445,14 +7445,14 @@ static void ggml_compute_forward_mul_mat(
     if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
                                      (const char *)src1->data + i12*nb12 + i13*nb13,
                                      nb11/ggml_type_size(src1->type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
-                                     ith, nth,
                                      src0->type,
                                      src1->type,
                                      dst->type))
@@ -7497,14 +7497,14 @@ UseGgmlGemm1:;
 
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
                                      (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                      row_size/ggml_type_size(vec_dot_type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
-                                     ith, nth,
                                      src0->type,
                                      vec_dot_type,
                                      dst->type))
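
Note: both call sites now pass the ggml_compute_params pointer first and drop the explicit ith/nth arguments, which the sgemm code presumably reads from params instead. The prototype below is inferred from these call sites only (an assumption, not copied from sgemm.h):

    // inferred shape of the updated llamafile_sgemm entry point (assumption)
    bool llamafile_sgemm(const struct ggml_compute_params * params,
                         int64_t m, int64_t n, int64_t k,
                         const void * A, int64_t lda,
                         const void * B, int64_t ldb,
                         void * C, int64_t ldc,
                         int Atype, int Btype, int Ctype);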

+ 10 - 1
llama/ggml-cpu.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -419,8 +419,11 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     switch (op->op) {
         case GGML_OP_CPY:
             return
+                op->type != GGML_TYPE_IQ3_XXS &&
+                op->type != GGML_TYPE_IQ3_S   &&
                 op->type != GGML_TYPE_IQ2_XXS &&
                 op->type != GGML_TYPE_IQ2_XS  &&
+                op->type != GGML_TYPE_IQ2_S   &&
                 op->type != GGML_TYPE_IQ1_S   &&
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
@@ -544,6 +547,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_sve()) {
             features.push_back({ "SVE", "1" });
         }
+        if (ggml_cpu_has_dotprod()) {
+            features.push_back({ "DOTPROD", "1" });
+        }
+        if (ggml_cpu_has_matmul_int8()) {
+            features.push_back({ "MATMUL_INT8", "1" });
+        }
         if (ggml_cpu_get_sve_cnt() > 0) {
             static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
             features.push_back({ "SVE_CNT", sve_cnt.c_str() });

+ 1 - 1
llama/ggml-cpu.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/acc.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/acc.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/arange.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/arange.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argmax.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argmax.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argsort.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argsort.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/binbcast.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/binbcast.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/clamp.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/clamp.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/common.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/concat.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/concat.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/conv-transpose-1d.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/conv-transpose-1d.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 3 - 1
llama/ggml-cuda/convert.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -706,6 +706,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
             return convert_unary_cuda<half>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }
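Note: the new GGML_TYPE_BF16 case lets the CUDA to-float table handle bfloat16 tensors by widening them with convert_unary_cuda<nv_bfloat16>. For reference, a scalar CPU sketch of that widening (bfloat16 is simply the upper 16 bits of an IEEE-754 float32, so the conversion is a shift plus a bit-cast); the function name is illustrative and not part of the vendored code:

    #include <cstdint>
    #include <cstring>

    // Widen a row of raw bf16 values to f32. Each bf16 value occupies the
    // top 16 bits of the corresponding float32 bit pattern.
    static void bf16_row_to_f32_ref(const uint16_t * x, float * y, int64_t k) {
        for (int64_t i = 0; i < k; ++i) {
            const uint32_t bits = static_cast<uint32_t>(x[i]) << 16;
            std::memcpy(&y[i], &bits, sizeof(float));
        }
    }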

+ 1 - 1
llama/ggml-cuda/convert.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/count-equal.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/count-equal.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cpy.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cpy.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cross-entropy-loss.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cross-entropy-loss.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/dequantize.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/diagmask.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/diagmask.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-common.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f32.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f32.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-vec-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-vec-f32.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-wmma-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/getrows.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/getrows.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 3 - 2
llama/ggml-cuda/ggml-cuda.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -1758,7 +1758,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
-    bool use_mul_mat_vec   = src0->type == GGML_TYPE_F16
+    bool use_mul_mat_vec   = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
@@ -2904,6 +2904,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     case GGML_TYPE_IQ3_XXS:
                     case GGML_TYPE_IQ4_NL:
                     case GGML_TYPE_IQ4_XS:
+                    case GGML_TYPE_BF16:
 #ifdef GGML_USE_MUSA
                         if (a->type == GGML_TYPE_Q3_K) {
                             return false;
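Note: with this change the dedicated matrix-vector path also accepts BF16 weights, and supports_op advertises GGML_TYPE_BF16 for MUL_MAT. The gating condition is easy to misread in diff form; a small C++ sketch with simplified, assumed names (not the actual ggml_tensor fields):

    enum class dtype { f16, bf16, f32, other };

    // The custom mul_mat_vec kernel is used only for an F16/BF16 weight
    // matrix multiplied by a single F32 column, producing F32 output, and
    // only when the row length is even so elements can be read in pairs.
    static bool takes_mul_mat_vec_path(dtype w, dtype act, dtype out,
                                       long long ncols, long long n_act_cols) {
        const bool weight_ok = (w == dtype::f16 || w == dtype::bf16);
        return weight_ok
            && act == dtype::f32 && out == dtype::f32
            && ncols % 2 == 0 && n_act_cols == 1;
    }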

+ 1 - 1
llama/ggml-cuda/im2col.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/im2col.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mma.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmq.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmq.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 77 - 39
llama/ggml-cuda/mmv.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -27,9 +27,9 @@
 #include "common.cuh"
 #include "mmv.cuh"
 
-template <typename type_acc, int block_size>
+template <typename T, typename type_acc, int block_size>
 static __global__ void mul_mat_vec(
-        const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
+        const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
         const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
     const int64_t row     = blockIdx.x;
     const int64_t channel = blockIdx.z;
@@ -39,7 +39,6 @@ static __global__ void mul_mat_vec(
     y   +=  channel               *stride_channel_y;
     dst +=  channel               *stride_channel_dst;
 
-    const half2  * x2 = (const half2  *) x;
     const float2 * y2 = (const float2 *) y;
 
     extern __shared__ char data_mmv[];
@@ -54,28 +53,44 @@ static __global__ void mul_mat_vec(
 
     float sumf;
 
-    if (std::is_same<type_acc, float>::value) {
-        sumf = 0.0f;
+    if constexpr (std::is_same<T, half>::value) {
+        const half2 * x2 = (const half2 *) x;
 
-        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
-            const float2 tmpx = __half22float2(x2[col2]);
-            const float2 tmpy = y2[col2];
-            sumf += tmpx.x * tmpy.x;
-            sumf += tmpx.y * tmpy.y;
-        }
-    } else {
+        if (std::is_same<type_acc, float>::value) {
+            sumf = 0.0f;
+
+            for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+                const float2 tmpx = __half22float2(x2[col2]);
+                const float2 tmpy = y2[col2];
+                sumf += tmpx.x * tmpy.x;
+                sumf += tmpx.y * tmpy.y;
+            }
+        } else {
 #ifdef FP16_AVAILABLE
-        half2 sumh2 = make_half2(0.0f, 0.0f);
+            half2 sumh2 = make_half2(0.0f, 0.0f);
 
-        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
-            const float2 tmp = y2[col2];
-            sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
-        }
+            for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+                const float2 tmp = y2[col2];
+                sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
+            }
 
-        sumf = __low2float(sumh2) + __high2float(sumh2);
+            sumf = __low2float(sumh2) + __high2float(sumh2);
 #else
-        NO_DEVICE_CODE;
+            NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
+        }
+    } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
+        const int * x2 = (const int *) x;
+        sumf = 0.0f;
+
+        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+            const int    tmpx = x2[col2];
+            const float2 tmpy = y2[col2];
+            sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+            sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+        }
+    } else {
+        static_assert(std::is_same<T, void>::value, "unsupported type");
     }
 
     sumf = warp_reduce_sum(sumf);
@@ -97,9 +112,9 @@ static __global__ void mul_mat_vec(
     dst[row] = sumf;
 }
 
-template <typename type_acc>
+template <typename T, typename type_acc>
 static void launch_mul_mat_vec_cuda(
-        const half * x, const float * y, float * dst,
+        const T * x, const float * y, float * dst,
         const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
         const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
         cudaStream_t stream) {
@@ -123,35 +138,35 @@ static void launch_mul_mat_vec_cuda(
     const dim3 block_dims(block_size_best, 1, 1);
     switch (block_size_best) {
         case   32: {
-            mul_mat_vec<type_acc,  32><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc,  32><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case   64: {
-            mul_mat_vec<type_acc,  64><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc,  64><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case   96: {
-            mul_mat_vec<type_acc,  96><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc,  96><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case  128: {
-            mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc, 128><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case  160: {
-            mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc, 160><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case  192: {
-            mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc, 192><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case  224: {
-            mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc, 224><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case  256: {
-            mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc, 256><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         default: {
@@ -160,25 +175,25 @@ static void launch_mul_mat_vec_cuda(
     }
 }
 
+template<typename T>
 static void mul_mat_vec_cuda(
-        const half * x, const float * y, float * dst,
+        const T * x, const float * y, float * dst,
         const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
         const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
         enum ggml_prec prec, cudaStream_t stream) {
     switch (prec) {
         case GGML_PREC_DEFAULT: {
-            launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+            launch_mul_mat_vec_cuda<T, half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
                 stride_channel_x, stride_channel_y, stride_channel_dst, stream);
         } break;
         case GGML_PREC_F32: {
-            launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+            launch_mul_mat_vec_cuda<T, float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
                 stride_channel_x, stride_channel_y, stride_channel_dst, stream);
         } break;
     }
 }
 
 void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type  == GGML_TYPE_F32);
 
@@ -190,7 +205,6 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
 
-    const half  * src0_d = (const half  *) src0->data;
     const float * src1_d = (const float *) src1->data;
     float       *  dst_d = (float       *)  dst->data;
 
@@ -207,7 +221,20 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
     const int64_t channel_stride_y   = src1->nb[2] / ggml_type_size(src1->type);
     const int64_t channel_stride_dst =  dst->nb[2] / ggml_type_size( dst->type);
 
-    mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+    switch (src0->type) {
+        case GGML_TYPE_F16: {
+            const half * src0_d = (const half *) src0->data;
+            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
+                channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+        } break;
+        case GGML_TYPE_BF16: {
+            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
+            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
+                channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+        } break;
+        default:
+            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+    }
 }
 
 void ggml_cuda_op_mul_mat_vec(
@@ -216,7 +243,6 @@ void ggml_cuda_op_mul_mat_vec(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type  == GGML_TYPE_F32);
 
@@ -237,8 +263,20 @@ void ggml_cuda_op_mul_mat_vec(
     const int64_t channel_stride_y   = 0;
     const int64_t channel_stride_dst = 0;
 
-    mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
-        nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+    switch (src0->type) {
+        case GGML_TYPE_F16: {
+            const half * src0_d = (const half *) src0_dd_i;
+            mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+                nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+        } break;
+        case GGML_TYPE_BF16: {
+            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
+            mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+                nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+        } break;
+        default:
+            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+    }
 
     GGML_UNUSED(ctx);
     GGML_UNUSED(src1);
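Note: the kernel is now templated on the element type T (half or nv_bfloat16), and the host entry points dispatch on src0->type instead of asserting F16. In the BF16 branch, the kernel loads two bf16 weights at a time through a 32-bit int and widens each half before multiplying with the matching float activations. A host-side C++ sketch of that packed-pair step (assuming little-endian layout, matching the device code's reinterpret_cast indexing); it illustrates the arithmetic only and is not the CUDA kernel:

    #include <cstdint>
    #include <cstring>

    // Widen one raw bf16 value (upper 16 bits of a float32) to float.
    static float bf16_to_f32(uint16_t v) {
        const uint32_t bits = static_cast<uint32_t>(v) << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // One inner-loop step of the BF16 branch: two packed bf16 weights
    // multiplied with two float activations and accumulated.
    static float bf16_pair_dot(uint32_t packed_x, float y0, float y1) {
        const uint16_t x0 = static_cast<uint16_t>(packed_x & 0xFFFFu);  // element [0]
        const uint16_t x1 = static_cast<uint16_t>(packed_x >> 16);      // element [1]
        return bf16_to_f32(x0) * y0 + bf16_to_f32(x1) * y1;
    }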

+ 1 - 1
llama/ggml-cuda/mmv.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmvq.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmvq.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/norm.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/norm.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/opt-step-adamw.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/opt-step-adamw.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/out-prod.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/out-prod.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pad.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pad.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pool2d.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pool2d.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/quantize.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/quantize.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/rope.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/rope.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/scale.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/scale.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/softmax.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/softmax.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/sum.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/sum.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/sumrows.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/sumrows.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

Some files were not shown because of the large number of changes