@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
+ * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
  *
  * MIT License
  *
@@ -29,7 +29,6 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
-#include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
@@ -66,6 +65,11 @@
 #include <cinttypes>
 #include <limits>
 
+#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
+#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
+
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
 #ifndef NOMINMAX
@@ -204,7 +208,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
 static int get_key_idx(const gguf_context * ctx, const char * key) {
     int i = gguf_find_key(ctx, key);
     if (i == -1) {
-        LOG_TEE("key %s not found in file\n", key);
+        LOG_ERR("key %s not found in file\n", key);
         throw std::runtime_error(format("Missing required key: %s", key));
     }
 
@@ -309,7 +313,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
 
 static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
     size_t tensor_size = ggml_nbytes(tensor);
-    LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
+    LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
             prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
             tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
 }
@@ -327,7 +331,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
-        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
         return;
     }
 
@@ -346,7 +350,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
 static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
     std::ofstream file(filename, std::ios::binary);
     if (!file.is_open()) {
-        LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
+        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
         return;
     }
 
@@ -607,7 +611,7 @@ struct clip_ctx {
 
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return nullptr;
     }
 
@@ -621,7 +625,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         if (load_image_size == nullptr) {
             load_image_size = clip_image_size_init();
         }
-        LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
         image_size_width  = load_image_size->width;
         image_size_height = load_image_size->height;
         if (is_inf) {
@@ -1086,21 +1090,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         const int idx_name = gguf_find_key(ctx, KEY_NAME);
         if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
             const std::string name = gguf_get_val_str(ctx, idx_name);
-            LOG_TEE("%s: model name:   %s\n", __func__, name.c_str());
+            LOG_INF("%s: model name:   %s\n", __func__, name.c_str());
         }
-        LOG_TEE("%s: description:  %s\n", __func__, description.c_str());
-        LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
-        LOG_TEE("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
-        LOG_TEE("%s: n_tensors:    %d\n", __func__, n_tensors);
-        LOG_TEE("%s: n_kv:         %d\n", __func__, n_kv);
-        LOG_TEE("%s: ftype:        %s\n", __func__, ftype_str.c_str());
-        LOG_TEE("\n");
+        LOG_INF("%s: description:  %s\n", __func__, description.c_str());
+        LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
+        LOG_INF("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
+        LOG_INF("%s: n_tensors:    %d\n", __func__, n_tensors);
+        LOG_INF("%s: n_kv:         %d\n", __func__, n_kv);
+        LOG_INF("%s: ftype:        %s\n", __func__, ftype_str.c_str());
+        LOG_INF("\n");
     }
     const int n_tensors = gguf_get_n_tensors(ctx);
 
     // kv
     const int n_kv = gguf_get_n_kv(ctx);
-    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
+    LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
         __func__, n_kv, n_tensors, fname);
     {
         std::map<enum ggml_type, uint32_t> n_type;
@@ -1111,7 +1115,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             n_type[type]++;
         }
 
-        LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
+        LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
         for (int i = 0; i < n_kv; i++) {
             const char * name           = gguf_get_key(ctx, i);
             const enum gguf_type type   = gguf_get_kv_type(ctx, i);
@@ -1127,7 +1131,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             }
             replace_all(value, "\n", "\\n");
 
-            LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
+            LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }
 
         // print type counts
@@ -1136,7 +1140,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 continue;
             }
 
-            LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
         }
     }
 
@@ -1151,7 +1155,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             size_t tensor_size = ggml_nbytes(cur);
             model_size += tensor_size;
             if (verbosity >= 3) {
-                LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
+                LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
                        __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
             }
         }
@@ -1178,27 +1182,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
 #ifdef GGML_USE_CUDA
     new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
+    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_METAL
     new_clip->backend = ggml_backend_metal_init();
-    LOG_TEE("%s: CLIP using Metal backend\n", __func__);
+    LOG_INF("%s: CLIP using Metal backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_CANN
     new_clip->backend = ggml_backend_cann_init(0);
-    LOG_TEE("%s: CLIP using CANN backend\n", __func__);
+    LOG_INF("%s: CLIP using CANN backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_VULKAN
     new_clip->backend = ggml_backend_vk_init(0);
-    LOG_TEE("%s: CLIP using Vulkan backend\n", __func__);
+    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
 #endif
 
     if (!new_clip->backend) {
         new_clip->backend = ggml_backend_cpu_init();
-        LOG_TEE("%s: CLIP using CPU backend\n", __func__);
+        LOG_INF("%s: CLIP using CPU backend\n", __func__);
     }
 
     // model size and capabilities
@@ -1233,16 +1237,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
 
         if (verbosity >= 1) {
-            LOG_TEE("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
-            LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
-            LOG_TEE("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
-            LOG_TEE("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
-            LOG_TEE("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
-            LOG_TEE("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
+            LOG_INF("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
+            LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
+            LOG_INF("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
+            LOG_INF("%s: minicpmv_projector:  %d\n", __func__, new_clip->has_minicpmv_projector);
+            LOG_INF("%s: model size:     %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
+            LOG_INF("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
         }
     }
 
-    LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
+    LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
 
     // load tensors
     {
@@ -1255,12 +1259,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
         new_clip->ctx_data = ggml_init(params);
         if (!new_clip->ctx_data) {
-            LOG_TEE("%s: ggml_init() failed\n", __func__);
+            LOG_ERR("%s: ggml_init() failed\n", __func__);
             clip_free(new_clip);
             gguf_free(ctx);
             return nullptr;
         }
-
 #ifdef _WIN32
         int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
         if (!wlen) {
@@ -1285,7 +1288,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         auto fin = std::ifstream(fname, std::ios::binary);
 #endif
         if (!fin) {
-            LOG_TEE("cannot open model file for loading tensors\n");
+            LOG_ERR("cannot open model file for loading tensors\n");
             clip_free(new_clip);
             gguf_free(ctx);
             return nullptr;
@@ -1307,7 +1310,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
             fin.seekg(offset, std::ios::beg);
             if (!fin) {
-                LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
+                LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name);
                 clip_free(new_clip);
                 gguf_free(ctx);
                 return nullptr;
@@ -1382,23 +1385,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 
     if (verbosity >= 2) {
-        LOG_TEE("\n%s: vision model hparams\n", __func__);
-        LOG_TEE("image_size         %d\n", hparams.image_size);
-        LOG_TEE("patch_size         %d\n", hparams.patch_size);
-        LOG_TEE("v_hidden_size      %d\n", hparams.hidden_size);
-        LOG_TEE("v_n_intermediate   %d\n", hparams.n_intermediate);
-        LOG_TEE("v_projection_dim   %d\n", hparams.projection_dim);
-        LOG_TEE("v_n_head           %d\n", hparams.n_head);
-        LOG_TEE("v_n_layer          %d\n", hparams.n_layer);
-        LOG_TEE("v_eps              %f\n", hparams.eps);
-        LOG_TEE("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
-        LOG_TEE("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
-        LOG_TEE("v_image_grid_pinpoints: ");
+        LOG_INF("\n%s: vision model hparams\n", __func__);
+        LOG_INF("image_size         %d\n", hparams.image_size);
+        LOG_INF("patch_size         %d\n", hparams.patch_size);
+        LOG_INF("v_hidden_size      %d\n", hparams.hidden_size);
+        LOG_INF("v_n_intermediate   %d\n", hparams.n_intermediate);
+        LOG_INF("v_projection_dim   %d\n", hparams.projection_dim);
+        LOG_INF("v_n_head           %d\n", hparams.n_head);
+        LOG_INF("v_n_layer          %d\n", hparams.n_layer);
+        LOG_INF("v_eps              %f\n", hparams.eps);
+        LOG_INF("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+        LOG_INF("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+        LOG_INF("v_image_grid_pinpoints: ");
         for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
-            LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
+            LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
         }
-        LOG_TEE("\n");
-        LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+        LOG_INF("\n");
+        LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
 
     }
 
@@ -1436,7 +1439,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
         vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
     } catch(const std::exception& /*e*/) {
-        LOG_TEE("%s: failed to load vision model tensors\n", __func__);
+        LOG_ERR("%s: failed to load vision model tensors\n", __func__);
     }
 
     // LLaVA projection
@@ -1465,7 +1468,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         } catch (std::runtime_error & /*e*/) { }
         try {
             vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
-            // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
+            // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__);
         } catch (std::runtime_error & /*e*/) { }
     } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
         // MobileVLM projection
@@ -1566,7 +1569,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
-        LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
     }
 
     return new_clip;
@@ -1617,7 +1620,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
     int nx, ny, nc;
     auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
     if (!data) {
-        LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
+        LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
         return false;
     }
     build_clip_img_from_data(data, nx, ny, img);
@@ -1629,7 +1632,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
     int nx, ny, nc;
     auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
     if (!data) {
-        LOG_TEE("%s: failed to decode image bytes\n", __func__);
+        LOG_ERR("%s: failed to decode image bytes\n", __func__);
         return false;
     }
     build_clip_img_from_data(data, nx, ny, img);
@@ -1819,7 +1822,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
         int downscaled_height = static_cast<int>(original_height * scale);
         int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
         int wasted_resolution = (width * height) - effective_resolution;
-        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
         if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
             max_effective_resolution = effective_resolution;
             min_wasted_resolution = wasted_resolution;
@@ -1937,7 +1940,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
     const int multiple = fmin(ceil(ratio), max_slice_nums);
 
     std::vector<std::vector<clip_image_u8 *>> images;
-    LOG_TEE("%s: multiple %d\n", __func__, multiple);
+    LOG_INF("%s: multiple %d\n", __func__, multiple);
     images.push_back(std::vector<clip_image_u8 *>());
 
     if (multiple <= 1) {
@@ -1952,17 +1955,17 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
         clip_image_u8 * source_image = clip_image_u8_init();
         bicubic_resize(*img, *source_image, best_size.first, best_size.second);
         // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
-        LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
         images[images.size()-1].push_back(source_image);
 
         std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
-        LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+        LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
 
         auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
         clip_image_u8 * refine_image = clip_image_u8_init();
         bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
 
-        LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+        LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
 
         // split_to_patches
         int width  = refine_image->nx;
@@ -2019,7 +2022,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         int idx = 0;
         for (size_t i = 0; i < imgs.size(); ++i) {
             for (size_t j = 0; j < imgs[i].size(); ++j) {
-                LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
                 clip_image_f32 * res = clip_image_f32_init();
                 normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
                 res_imgs->data[idx++] = *res;
@@ -2031,7 +2034,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
 
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
     auto & params = ctx->vision_model.hparams;
@@ -2108,7 +2111,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     }
 
     for (size_t i = 0; i < patches.size(); i++) {
-        // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+        // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
         clip_image_u8_free(patches[i]);
     }
 
@@ -2344,7 +2347,7 @@ static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, co
 
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
 
@@ -2356,7 +2359,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
 
 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
     if (!ctx->has_vision_encoder) {
-        LOG_TEE("This gguf file seems to have no vision encoder\n");
+        LOG_ERR("This gguf file seems to have no vision encoder\n");
         return false;
     }
 
@@ -2505,16 +2508,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
     }
 
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(ctx->backend)) {
-        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
-    }
-#endif
-
     ggml_backend_graph_compute(ctx->backend, gf);
 
     // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
@@ -2586,7 +2583,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             new_type = type;
             if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                 new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
-                // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
+                // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
             }
             const size_t n_elms = ggml_nelements(cur);
             float * f32_data;
@@ -2605,7 +2602,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
                 f32_data = (float *)conv_buf.data();
                 break;
             default:
-                LOG_TEE("Please use an input file in f32 or f16\n");
+                LOG_ERR("Please use an input file in f32 or f16\n");
                 gguf_free(ctx_out);
                 return false;
             }
@@ -2632,7 +2629,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
             fout.put(0);
         }
 
-        LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
            orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
     }
 
@@ -2648,8 +2645,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
     gguf_free(ctx_out);
 
     {
-        LOG_TEE("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        LOG_INF("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
     }
 
     return true;
|