Browse Source

llama: update vendored code to commit 46e3556 (#8308)

Jeffrey Morgan 3 months ago
parent
commit
1deafd8254
100 changed files with 447 additions and 327 deletions
  1. 0 2
      api/types.go
  2. 1 1
      llama/amx.cpp
  3. 1 1
      llama/amx.h
  4. 14 14
      llama/clip.cpp
  5. 1 1
      llama/clip.h
  6. 59 23
      llama/common.cpp
  7. 42 20
      llama/common.h
  8. 1 2
      llama/ggml-alloc.c
  9. 1 1
      llama/ggml-alloc.h
  10. 1 1
      llama/ggml-backend-impl.h
  11. 78 53
      llama/ggml-backend-reg.cpp
  12. 5 2
      llama/ggml-backend.cpp
  13. 1 1
      llama/ggml-backend.h
  14. 1 1
      llama/ggml-blas.cpp
  15. 1 1
      llama/ggml-blas.h
  16. 1 1
      llama/ggml-common.h
  17. 1 1
      llama/ggml-cpp.h
  18. 56 73
      llama/ggml-cpu-aarch64.cpp
  19. 1 1
      llama/ggml-cpu-aarch64.h
  20. 1 1
      llama/ggml-cpu-impl.h
  21. 6 2
      llama/ggml-cpu-quants.c
  22. 1 1
      llama/ggml-cpu-quants.h
  23. 1 1
      llama/ggml-cpu-traits.cpp
  24. 1 1
      llama/ggml-cpu-traits.h
  25. 7 7
      llama/ggml-cpu.c
  26. 10 1
      llama/ggml-cpu.cpp
  27. 1 1
      llama/ggml-cpu.h
  28. 1 1
      llama/ggml-cuda.h
  29. 1 1
      llama/ggml-cuda/acc.cu
  30. 1 1
      llama/ggml-cuda/acc.cuh
  31. 1 1
      llama/ggml-cuda/arange.cu
  32. 1 1
      llama/ggml-cuda/arange.cuh
  33. 1 1
      llama/ggml-cuda/argmax.cu
  34. 1 1
      llama/ggml-cuda/argmax.cuh
  35. 1 1
      llama/ggml-cuda/argsort.cu
  36. 1 1
      llama/ggml-cuda/argsort.cuh
  37. 1 1
      llama/ggml-cuda/binbcast.cu
  38. 1 1
      llama/ggml-cuda/binbcast.cuh
  39. 1 1
      llama/ggml-cuda/clamp.cu
  40. 1 1
      llama/ggml-cuda/clamp.cuh
  41. 1 1
      llama/ggml-cuda/common.cuh
  42. 1 1
      llama/ggml-cuda/concat.cu
  43. 1 1
      llama/ggml-cuda/concat.cuh
  44. 1 1
      llama/ggml-cuda/conv-transpose-1d.cu
  45. 1 1
      llama/ggml-cuda/conv-transpose-1d.cuh
  46. 3 1
      llama/ggml-cuda/convert.cu
  47. 1 1
      llama/ggml-cuda/convert.cuh
  48. 1 1
      llama/ggml-cuda/count-equal.cu
  49. 1 1
      llama/ggml-cuda/count-equal.cuh
  50. 1 1
      llama/ggml-cuda/cpy.cu
  51. 1 1
      llama/ggml-cuda/cpy.cuh
  52. 1 1
      llama/ggml-cuda/cross-entropy-loss.cu
  53. 1 1
      llama/ggml-cuda/cross-entropy-loss.cuh
  54. 1 1
      llama/ggml-cuda/dequantize.cuh
  55. 1 1
      llama/ggml-cuda/diagmask.cu
  56. 1 1
      llama/ggml-cuda/diagmask.cuh
  57. 1 1
      llama/ggml-cuda/fattn-common.cuh
  58. 1 1
      llama/ggml-cuda/fattn-tile-f16.cu
  59. 1 1
      llama/ggml-cuda/fattn-tile-f16.cuh
  60. 1 1
      llama/ggml-cuda/fattn-tile-f32.cu
  61. 1 1
      llama/ggml-cuda/fattn-tile-f32.cuh
  62. 1 1
      llama/ggml-cuda/fattn-vec-f16.cuh
  63. 1 1
      llama/ggml-cuda/fattn-vec-f32.cuh
  64. 1 1
      llama/ggml-cuda/fattn-wmma-f16.cuh
  65. 1 1
      llama/ggml-cuda/fattn.cu
  66. 1 1
      llama/ggml-cuda/fattn.cuh
  67. 1 1
      llama/ggml-cuda/getrows.cu
  68. 1 1
      llama/ggml-cuda/getrows.cuh
  69. 3 2
      llama/ggml-cuda/ggml-cuda.cu
  70. 1 1
      llama/ggml-cuda/im2col.cu
  71. 1 1
      llama/ggml-cuda/im2col.cuh
  72. 1 1
      llama/ggml-cuda/mma.cuh
  73. 1 1
      llama/ggml-cuda/mmq.cu
  74. 1 1
      llama/ggml-cuda/mmq.cuh
  75. 77 39
      llama/ggml-cuda/mmv.cu
  76. 1 1
      llama/ggml-cuda/mmv.cuh
  77. 1 1
      llama/ggml-cuda/mmvq.cu
  78. 1 1
      llama/ggml-cuda/mmvq.cuh
  79. 1 1
      llama/ggml-cuda/norm.cu
  80. 1 1
      llama/ggml-cuda/norm.cuh
  81. 1 1
      llama/ggml-cuda/opt-step-adamw.cu
  82. 1 1
      llama/ggml-cuda/opt-step-adamw.cuh
  83. 1 1
      llama/ggml-cuda/out-prod.cu
  84. 1 1
      llama/ggml-cuda/out-prod.cuh
  85. 1 1
      llama/ggml-cuda/pad.cu
  86. 1 1
      llama/ggml-cuda/pad.cuh
  87. 1 1
      llama/ggml-cuda/pool2d.cu
  88. 1 1
      llama/ggml-cuda/pool2d.cuh
  89. 1 1
      llama/ggml-cuda/quantize.cu
  90. 1 1
      llama/ggml-cuda/quantize.cuh
  91. 1 1
      llama/ggml-cuda/rope.cu
  92. 1 1
      llama/ggml-cuda/rope.cuh
  93. 1 1
      llama/ggml-cuda/scale.cu
  94. 1 1
      llama/ggml-cuda/scale.cuh
  95. 1 1
      llama/ggml-cuda/softmax.cu
  96. 1 1
      llama/ggml-cuda/softmax.cuh
  97. 1 1
      llama/ggml-cuda/sum.cu
  98. 1 1
      llama/ggml-cuda/sum.cuh
  99. 1 1
      llama/ggml-cuda/sumrows.cu
  100. 1 1
      llama/ggml-cuda/sumrows.cuh

+ 0 - 2
api/types.go

@@ -225,7 +225,6 @@ type Options struct {
 	Mirostat         int      `json:"mirostat,omitempty"`
 	MirostatTau      float32  `json:"mirostat_tau,omitempty"`
 	MirostatEta      float32  `json:"mirostat_eta,omitempty"`
-	PenalizeNewline  bool     `json:"penalize_newline,omitempty"`
 	Stop             []string `json:"stop,omitempty"`
 }
 
@@ -606,7 +605,6 @@ func DefaultOptions() Options {
 		Mirostat:         0,
 		MirostatTau:      5.0,
 		MirostatEta:      0.1,
-		PenalizeNewline:  true,
 		Seed:             -1,
 
 		Runner: Runner{

+ 1 - 1
llama/amx.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/amx.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 14 - 14
llama/clip.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -935,7 +935,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                 mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
                 mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
                 // stride = 1, padding = 1, bias is nullptr
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
 
                 // layer norm
                 // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
@@ -983,7 +983,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             // block_2
             {
                 // stride = 2
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
 
                 // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
                 // layer norm
@@ -1044,7 +1044,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             // mlp_2 ne [24, 24, 2048, 1]
             mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
             // weight ne = [3, 3, 2048, 1]
-            struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+            struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
             peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
             peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
             mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
@@ -1262,28 +1262,28 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 
 #ifdef GGML_USE_CUDA
-    new_clip->backend = ggml_backend_cuda_init(0);
-    LOG_INF("%s: CLIP using CUDA backend\n", __func__);
+   new_clip->backend = ggml_backend_cuda_init(0);
+   LOG_INF("%s: CLIP using CUDA backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_METAL
-    new_clip->backend = ggml_backend_metal_init();
-    LOG_INF("%s: CLIP using Metal backend\n", __func__);
+   new_clip->backend = ggml_backend_metal_init();
+   LOG_INF("%s: CLIP using Metal backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_CANN
-    new_clip->backend = ggml_backend_cann_init(0);
-    LOG_INF("%s: CLIP using CANN backend\n", __func__);
+   new_clip->backend = ggml_backend_cann_init(0);
+   LOG_INF("%s: CLIP using CANN backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_VULKAN
-    new_clip->backend = ggml_backend_vk_init(0);
-    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
+   new_clip->backend = ggml_backend_vk_init(0);
+   LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
 #endif
 
 #ifdef GGML_USE_SYCL
-    new_clip->backend = ggml_backend_sycl_init(0);
-    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
+   new_clip->backend = ggml_backend_sycl_init(0);
+   LOG_INF("%s: CLIP using SYCL backend\n", __func__);
 #endif
 
     if (!new_clip->backend) {
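
Note: apart from the vendored-commit header and the indentation-only changes, the functional change in clip.cpp is the rename of ggml_conv_depthwise_2d to ggml_conv_2d_dw. A minimal sketch of a call after the rename, assuming the nine-argument order (stride, padding, dilation per axis) is unchanged from the calls in this diff; conv_w and inp are hypothetical tensors:

    // ctx0, conv_w, inp: assumed to already exist, as in the calls above
    struct ggml_tensor * out = ggml_conv_2d_dw(ctx0, conv_w, inp,
                                               1, 1,   // stride   s0, s1
                                               1, 1,   // padding  p0, p1
                                               1, 1);  // dilation d0, d1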

+ 1 - 1
llama/clip.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 59 - 23
llama/common.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -44,6 +44,7 @@
 #include <cstdarg>
 #include <cstring>
 #include <ctime>
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <iterator>
@@ -88,7 +89,9 @@
 #ifdef __linux__
 #include <linux/limits.h>
 #elif defined(_WIN32)
-#define PATH_MAX MAX_PATH
+#   if !defined(PATH_MAX)
+#   define PATH_MAX MAX_PATH
+#   endif
 #else
 #include <sys/syslimits.h>
 #endif
@@ -912,9 +915,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
-        llama_free_model(model);
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }
 
     if (!params.control_vectors.empty()) {
@@ -945,20 +947,21 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_lora_adapter_ptr lora;
+        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
+
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, params.lora_adapters);
     }
 
     if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -966,6 +969,25 @@ struct common_init_result common_init_from_params(common_params & params) {
         params.sampling.ignore_eos = false;
     }
 
+    if (params.sampling.ignore_eos) {
+        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+            if (llama_token_is_eog(model, i)) {
+                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+                params.sampling.logit_bias.push_back({i, -INFINITY});
+            }
+        }
+    }
+
+    if (params.sampling.penalty_last_n == -1) {
+        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    }
+
+    if (params.sampling.dry_penalty_last_n == -1) {
+        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    }
+
     if (params.warmup) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
@@ -1000,17 +1022,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }
 
-    iparams.model   = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);
 
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
     llama_lora_adapter_clear(ctx);
-    for (auto & la : lora_adapters) {
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_lora_adapter_set(ctx, la.ptr, la.scale);
         }
     }
 }
@@ -1102,7 +1124,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 #define CURL_MAX_RETRY 3
 #define CURL_RETRY_DELAY_SECONDS 2
 
-static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
     int remaining_attempts = max_attempts;
 
     while (remaining_attempts > 0) {
@@ -1126,7 +1148,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 }
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
     if (!curl) {
@@ -1156,8 +1177,7 @@ static bool common_download_file(const std::string & url, const std::string & pa
 #endif
 
     // Check if the file already exists locally
-    struct stat model_file_info;
-    auto file_exists = (stat(path.c_str(), &model_file_info) == 0);
+    auto file_exists = std::filesystem::exists(path);
 
     // If the file exists, check its JSON metadata companion file.
     std::string metadata_path = path + ".json";
@@ -1199,11 +1219,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
         std::string etag;
         std::string last_modified;
     };
+
     common_load_model_from_url_headers headers;
+
     {
         typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
         auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
 
             static std::regex header_regex("([^:]+): (.*)\r\n");
             static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1618,6 +1640,18 @@ std::string common_detokenize(llama_context * ctx, const std::vector<llama_token
 // Chat template utils
 //
 
+std::string common_get_builtin_chat_template(const struct llama_model * model) {
+    static const char * template_key = "tokenizer.chat_template";
+    // call with NULL buffer to get the total size of the string
+    int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
+    if (res > 0) {
+        std::vector<char> model_template(res + 1, 0);
+        llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
+        return std::string(model_template.data(), model_template.size() - 1);
+    }
+    return "";
+}
+
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
     int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
@@ -1787,7 +1821,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
             break;
         case 0: // max absolute
             for (int i = 0; i < n; i++) {
-                if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+                if (sum < std::abs(inp[i])) {
+                    sum = std::abs(inp[i]);
+                }
             }
             sum /= 32760.0; // make an int16 range
             break;
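
Note: a minimal usage sketch of the common_get_builtin_chat_template helper added above, assuming a loaded llama_model * named model (error handling abbreviated):

    const std::string tmpl = common_get_builtin_chat_template(model);
    if (tmpl.empty()) {
        LOG_WRN("model has no built-in chat template, a fallback template is needed\n");
    } else if (!common_chat_verify_template(tmpl)) {
        LOG_ERR("built-in chat template is not supported by llama_chat_apply_template\n");
    }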

+ 42 - 20
llama/common.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -28,7 +28,7 @@
 
 #pragma once
 
-#include "llama.h"
+#include "llama-cpp.h"
 
 #include <string>
 #include <vector>
@@ -53,10 +53,8 @@
 struct common_lora_adapter_info {
     std::string path;
     float scale;
-};
 
-struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    struct llama_lora_adapter * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -106,6 +104,7 @@ enum llama_example {
     LLAMA_EXAMPLE_LLAVA,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
+    LLAMA_EXAMPLE_TTS,
 
     LLAMA_EXAMPLE_COUNT,
 };
@@ -121,6 +120,7 @@ enum common_sampler_type {
     COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
     COMMON_SAMPLER_TYPE_XTC         = 8,
     COMMON_SAMPLER_TYPE_INFILL      = 9,
+    COMMON_SAMPLER_TYPE_PENALTIES   = 10,
 };
 
 // dimensionality reduction methods, used by cvector-generator
@@ -156,7 +156,6 @@ struct common_params_sampling {
     int32_t mirostat           = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau       = 5.00f; // target entropy
     float   mirostat_eta       = 0.10f; // learning rate
-    bool    penalize_nl        = false; // consider newlines as a repeatable token
     bool    ignore_eos         = false;
     bool    no_perf            = false; // disable performance metrics
     bool    timing_per_token   = false;
@@ -165,6 +164,7 @@ struct common_params_sampling {
 
 
     std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
         COMMON_SAMPLER_TYPE_TOP_K,
         COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -184,6 +184,7 @@ struct common_params_sampling {
 
 struct common_params_speculative {
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
     int32_t n_ctx        =     0; // draft context size
     int32_t n_max        =    16; // maximum number of tokens to draft during speculative decoding
     int32_t n_min        =     5; // minimum number of draft tokens to use for speculative decoding
@@ -197,6 +198,14 @@ struct common_params_speculative {
     std::string model = ""; // draft model for speculative decoding                          // NOLINT
 };
 
+struct common_params_vocoder {
+    std::string hf_repo = ""; // HF repo                                                     // NOLINT
+    std::string hf_file = ""; // HF file                                                     // NOLINT
+
+    std::string model     = ""; // model path                                                // NOLINT
+    std::string model_url = ""; // model url to download                                     // NOLINT
+};
+
 struct common_params {
     int32_t n_predict             =    -1; // new tokens to predict
     int32_t n_ctx                 =  4096; // context size
@@ -219,11 +228,13 @@ struct common_params {
     float   defrag_thold          =  0.1f; // KV cache defragmentation threshold
 
     // offload params
-    std::vector<ggml_backend_dev_t> devices;         // devices to use for offloading
-    int32_t n_gpu_layers                    =    -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu                        =     0; // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]               =   {0}; // how split tensors should be distributed across GPUs
-    enum llama_split_mode        split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+    int32_t n_gpu_layers      = -1;  // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu          = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
@@ -237,8 +248,9 @@ struct common_params {
     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_params_sampling sampling;
+    struct common_params_sampling    sampling;
     struct common_params_speculative speculative;
+    struct common_params_vocoder     vocoder;
 
     std::string model                = ""; // model path                                                    // NOLINT
     std::string model_alias          = ""; // model alias                                                   // NOLINT
@@ -490,10 +502,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //
 
+// note: defines object's lifetime
 struct common_init_result {
-    struct llama_model   * model   = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<common_lora_adapter_container> lora_adapters;
+    llama_model_ptr   model;
+    llama_context_ptr context;
+
+    std::vector<llama_lora_adapter_ptr> lora;
 };
 
 struct common_init_result     common_init_from_params(common_params & params);
@@ -515,7 +529,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
 
 //
 // Batch utils
@@ -583,6 +597,9 @@ struct common_chat_msg {
     std::string content;
 };
 
+// Get the built-in chat template for the model. Return empty string if not present.
+std::string common_get_builtin_chat_template(const struct llama_model * model);
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl);
 
@@ -619,7 +636,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
 // Embedding utils
 //
 
-void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+// TODO: repace embd_norm with an enum
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
 
 float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
 
@@ -648,6 +666,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //
 
-static const char * const LLM_KV_SPLIT_NO            = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO            = "split.no";
+const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
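
Note: with this change common_init_result owns the model, context, and LoRA adapters through the smart-pointer aliases from llama-cpp.h, so callers no longer free them manually. A minimal consumer sketch, assuming llama_model_ptr and llama_context_ptr are the std::unique_ptr aliases declared there:

    common_init_result init = common_init_from_params(params);

    llama_model   * model = init.model.get();   // non-owning views
    llama_context * lctx  = init.context.get();
    if (model == nullptr || lctx == nullptr) {
        return 1;  // whatever was created is released by the smart pointers
    }

    // ... use model / lctx; everything is freed when `init` goes out of scope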

+ 1 - 2
llama/ggml-alloc.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -560,7 +560,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
         hn->buffer_id = buffer_id;
         hn->offset = offset;
-        return;
     }
 }
 

+ 1 - 1
llama/ggml-alloc.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-backend-impl.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 78 - 53
llama/ggml-backend-reg.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -92,6 +92,26 @@
 #include "ggml-kompute.h"
 #endif
 
+// disable C++17 deprecation warning for std::codecvt_utf8
+#if defined(__clang__)
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+static std::wstring utf8_to_utf16(const std::string & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.from_bytes(str);
+}
+
+static std::string utf16_to_utf8(const std::wstring & str) {
+    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
+    return converter.to_bytes(str);
+}
+
+#if defined(__clang__)
+#    pragma clang diagnostic pop
+#endif
+
 #ifdef _WIN32
 
 using dl_handle = std::remove_pointer_t<HMODULE>;
@@ -114,11 +134,6 @@ static dl_handle * dl_load_library(const std::wstring & path) {
     return handle;
 }
 
-static dl_handle * dl_load_library(const std::string & path) {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
-    return dl_load_library(converter.from_bytes(path));
-}
-
 static void * dl_get_sym(dl_handle * handle, const char * name) {
     DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
     SetErrorMode(old_mode | SEM_FAILCRITICALERRORS);
@@ -140,8 +155,8 @@ struct dl_handle_deleter {
     }
 };
 
-static void * dl_load_library(const std::string & path) {
-    dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
+static void * dl_load_library(const std::wstring & path) {
+    dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL);
 
     return handle;
 }
@@ -182,9 +197,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_CANN
         register_backend(ggml_backend_cann_reg());
 #endif
-#ifdef GGML_USE_BLAS
-        register_backend(ggml_backend_blas_reg());
-#endif
+// #ifdef GGML_USE_BLAS
+//         register_backend(ggml_backend_blas_reg());
+// #endif
 #ifdef GGML_USE_RPC
         register_backend(ggml_backend_rpc_reg());
 #endif
@@ -228,11 +243,11 @@ struct ggml_backend_registry {
         devices.push_back(device);
     }
 
-    ggml_backend_reg_t load_backend(const char * path, bool silent) {
+    ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) {
         dl_handle_ptr handle { dl_load_library(path) };
         if (!handle) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
+                GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -240,7 +255,7 @@ struct ggml_backend_registry {
         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
         if (score_fn && score_fn() == 0) {
             if (!silent) {
-                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
+                GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -248,7 +263,7 @@ struct ggml_backend_registry {
         auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
         if (!backend_init_fn) {
             if (!silent) {
-                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
+                GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str());
             }
             return nullptr;
         }
@@ -257,16 +272,16 @@ struct ggml_backend_registry {
         if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
             if (!silent) {
                 if (!reg) {
-                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
+                    GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str());
                 } else {
                     GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
-                        __func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
+                        __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION);
                 }
             }
             return nullptr;
         }
 
-        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
+        GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str());
 
         register_backend(reg, std::move(handle));
 
@@ -402,14 +417,14 @@ ggml_backend_t ggml_backend_init_best(void) {
 
 // Dynamic loading
 ggml_backend_reg_t ggml_backend_load(const char * path) {
-    return get_reg().load_backend(path, false);
+    return get_reg().load_backend(utf8_to_utf16(path), false);
 }
 
 void ggml_backend_unload(ggml_backend_reg_t reg) {
     get_reg().unload_backend(reg, true);
 }
 
-static std::string get_executable_path() {
+static std::wstring get_executable_path() {
 #if defined(__APPLE__)
     // get executable path
     std::vector<char> path;
@@ -427,13 +442,17 @@ static std::string get_executable_path() {
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "/";
-#elif defined(__linux__)
+    return utf8_to_utf16(base_path + "/");
+#elif defined(__linux__) || defined(__FreeBSD__)
     std::string base_path = ".";
     std::vector<char> path(1024);
     while (true) {
         // get executable path
+#    if defined(__linux__)
         ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
+#    elif defined(__FreeBSD__)
+        ssize_t len = readlink("/proc/curproc/file", path.data(), path.size());
+#    endif
         if (len == -1) {
             break;
         }
@@ -449,57 +468,63 @@ static std::string get_executable_path() {
         path.resize(path.size() * 2);
     }
 
-    return base_path + "/";
+    return utf8_to_utf16(base_path + "/");
 #elif defined(_WIN32)
-    std::vector<char> path(MAX_PATH);
-    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
+    std::vector<wchar_t> path(MAX_PATH);
+    DWORD len = GetModuleFileNameW(NULL, path.data(), path.size());
     if (len == 0) {
-        return "";
+        return {};
     }
-    std::string base_path(path.data(), len);
+    std::wstring base_path(path.data(), len);
     // remove executable name
     auto last_slash = base_path.find_last_of('\\');
     if (last_slash != std::string::npos) {
         base_path = base_path.substr(0, last_slash);
     }
-    return base_path + "\\";
+    return base_path + L"\\";
+#else
+    return {};
+#endif
+}
+
+static std::wstring backend_filename_prefix() {
+#ifdef _WIN32
+    return L"ggml-";
+#else
+    return L"libggml-";
 #endif
 }
 
-static std::string backend_filename_prefix() {
+static std::wstring backend_filename_suffix() {
 #ifdef _WIN32
-    return "ggml-";
+    return L".dll";
 #else
-    return "libggml-";
+    return L".so";
 #endif
 }
 
-static std::string backend_filename_suffix() {
+static std::wstring path_separator() {
 #ifdef _WIN32
-    return ".dll";
+    return L"\\";
 #else
-    return ".so";
+    return L"/";
 #endif
 }
 
 static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) {
     // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
      // TODO: search system paths
-    std::string file_prefix = backend_filename_prefix() + name + "-";
-    std::vector<std::string> search_paths;
+    std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-";
+    std::vector<std::wstring> search_paths;
     if (user_search_path == nullptr) {
-        search_paths.push_back("./");
+        search_paths.push_back(L"." + path_separator());
         search_paths.push_back(get_executable_path());
     } else {
-#if defined(_WIN32)
-        search_paths.push_back(std::string(user_search_path) + "\\");
-#else
-        search_paths.push_back(std::string(user_search_path) + "/");
-#endif
+        search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator());
     }
 
     int best_score = 0;
-    std::string best_path;
+    std::wstring best_path;
 
     namespace fs = std::filesystem;
     for (const auto & search_path : search_paths) {
@@ -509,27 +534,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
         fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
         for (const auto & entry : dir_it) {
             if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                std::string ext = entry.path().extension().string();
+                std::wstring filename = entry.path().filename().wstring();
+                std::wstring ext = entry.path().extension().wstring();
                 if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) {
-                    dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
+                    dl_handle_ptr handle { dl_load_library(entry.path().wstring()) };
                     if (!handle && !silent) {
-                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
+                        GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                     }
                     if (handle) {
                         auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
                         if (score_fn) {
                             int s = score_fn();
 #ifndef NDEBUG
-                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
+                            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s);
 #endif
                             if (s > best_score) {
                                 best_score = s;
-                                best_path = entry.path().string();
+                                best_path = entry.path().wstring();
                             }
                         } else {
                             if (!silent) {
-                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
+                                GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str());
                             }
                         }
                     }
@@ -541,15 +566,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
     if (best_score == 0) {
         // try to load the base backend
         for (const auto & search_path : search_paths) {
-            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
+            std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix();
             if (fs::exists(path)) {
-                return get_reg().load_backend(path.c_str(), silent);
+                return get_reg().load_backend(path, silent);
             }
         }
         return nullptr;
     }
 
-    return get_reg().load_backend(best_path.c_str(), silent);
+    return get_reg().load_backend(best_path, silent);
 }
 
 void ggml_backend_load_all() {
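
Note: path handling in this file is now std::wstring end to end, with utf8_to_utf16()/utf16_to_utf8() at the boundaries, while the public entry points keep their UTF-8 const char * signatures. A minimal caller sketch (the library filename is a hypothetical example):

    // UTF-8 path in; converted to std::wstring internally via utf8_to_utf16()
    ggml_backend_reg_t reg = ggml_backend_load("./libggml-cuda.so");
    if (reg == nullptr) {
        // load failed or the backend is not supported on this system
    }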

+ 5 - 2
llama/ggml-backend.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -826,9 +826,12 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_nodes; i++) {
         if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
             ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
-            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
+            GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, ggml_backend_name(split_backend),
                 sched->splits[cur_split].n_inputs);
             for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
+                if (j == 0) {
+                    GGML_LOG_DEBUG(": ");
+                }
                 GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
                     fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
             }

+ 1 - 1
llama/ggml-backend.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-blas.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-blas.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-common.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cpp.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 56 - 73
llama/ggml-cpu-aarch64.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -220,9 +220,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
 }
 
 static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
     const __m256i zero = _mm256_setzero_si256();
     return _mm256_dpbusd_epi32(zero, ax, sy);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    return _mm256_dpbusd_avx_epi32(zero, ax, sy);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -590,21 +593,21 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
 
 #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
-        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx;
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
         for (int c = 0; c < nc; c += ncols_interleaved) {
-            const block_q8_0 * a_ptr = (const block_q8_0 *)vy;
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
             float32x4_t acc = vdupq_n_f32(0);
             for (int b = 0; b < nb; b++) {
-                int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs);
-                int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16);
-                int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32);
-                int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48);
-                float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d);
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
 
                 int8x16_t a0 = vld1q_s8(a_ptr->qs);
                 int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2);
-                float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
 
                 int32x4_t ret = vdupq_n_s32(0);
 
@@ -673,72 +676,52 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
     UNUSED(ncols_interleaved);
     UNUSED(blocklen);
 
-#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        const void * b_ptr = vx;
-        const void * a_ptr = vy;
-        float * res_ptr = s;
+#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
+    if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
+        const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx;
 
-        __asm__ __volatile__(
-            "movi v2.16b, #0x4\n"
-            "movi v1.16b, #0xf0\n"
-            "add %x[b_ptr], %x[b_ptr], #0x8\n"
-            "1:"  // Column loop
-            "add x23, %x[a_ptr], #0x2\n"
-            "movi v0.16b, #0x0\n"
-            "mov x22, %x[nb]\n"
-            "2:"  // Block loop
-            "ldr q31, [%x[b_ptr], #0x0]\n"
-            "ldr q30, [%x[b_ptr], #0x10]\n"
-            "mov x21, x23\n"
-            "movi v29.4s, #0x0\n"
-            "ldr q28, [%x[b_ptr], #0x20]\n"
-            "ldr q27, [%x[b_ptr], #0x30]\n"
-            "movi v26.4s, #0x0\n"
-            "sub x20, x23, #0x2\n"
-            "ld1r { v25.8h }, [x20]\n"
-            "ldr q24, [%x[b_ptr], #-0x8]\n"
-            "sub x22, x22, #0x1\n"
-            "add x23, x23, #0x22\n"
-            "ld1r { v23.2d }, [x21], #0x8\n"
-            "sshl v22.16b, v31.16b, v2.16b\n"
-            "sshl v16.16b, v30.16b, v2.16b\n"
-            "add %x[b_ptr], %x[b_ptr], #0x48\n"
-            "ld1r { v21.2d }, [x21], #0x8\n"
-            "sshl v20.16b, v28.16b, v2.16b\n"
-            "sshl v19.16b, v27.16b, v2.16b\n"
-            "ld1r { v18.2d }, [x21], #0x8\n"
-            "ld1r { v17.2d }, [x21], #0x8\n"
-            "and v31.16b, v31.16b, v1.16b\n"
-            "and v30.16b, v30.16b, v1.16b\n"
-            ".inst 0x4e9796dd  // sdot v29.4s, v22.16b, v23.16b\n"
-            ".inst 0x4e97961a  // sdot v26.4s, v16.16b, v23.16b\n"
-            "and v28.16b, v28.16b, v1.16b\n"
-            "and v27.16b, v27.16b, v1.16b\n"
-            "fcvtl v25.4s, v25.4h\n"
-            "fcvtl v16.4s, v24.4h\n"
-            ".inst 0x4e95969d  // sdot v29.4s, v20.16b, v21.16b\n"
-            ".inst 0x4e95967a  // sdot v26.4s, v19.16b, v21.16b\n"
-            "fmul v16.4s, v16.4s, v25.4s\n"
-            ".inst 0x4e9297fd  // sdot v29.4s, v31.16b, v18.16b\n"
-            ".inst 0x4e9297da  // sdot v26.4s, v30.16b, v18.16b\n"
-            ".inst 0x4e91979d  // sdot v29.4s, v28.16b, v17.16b\n"
-            ".inst 0x4e91977a  // sdot v26.4s, v27.16b, v17.16b\n"
-            "addp v29.4s, v29.4s, v26.4s\n"
-            "scvtf v29.4s, v29.4s, #0x4\n"
-            "fmla v0.4s, v29.4s, v16.4s\n"
-            "cbnz x22, 2b\n"
-            "sub %x[nc], %x[nc], #0x4\n"
-            "str q0, [%x[res_ptr], #0x0]\n"
-            "add %x[res_ptr], %x[res_ptr], #0x10\n"
-            "cbnz %x[nc], 1b\n"
-            : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
-            : [a_ptr] "r" (a_ptr), [nb] "r" (nb)
-            : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"
-        );
+        for (int c = 0; c < nc; c += ncols_interleaved) {
+            const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
+            float32x4_t acc = vdupq_n_f32(0);
+            for (int b = 0; b < nb; b++) {
+                int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs);
+                int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16);
+                int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32);
+                int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48);
+                float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d);
+
+                int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs);
+                int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1);
+                int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2);
+                int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3);
+                float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d);
+
+                int32x4_t ret0 = vdupq_n_s32(0);
+                int32x4_t ret1 = vdupq_n_s32(0);
+
+                ret0 = vdotq_s32(ret0, b0 << 4, a0);
+                ret1 = vdotq_s32(ret1, b1 << 4, a0);
+                ret0 = vdotq_s32(ret0, b2 << 4, a1);
+                ret1 = vdotq_s32(ret1, b3 << 4, a1);
+
+                ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2);
+                ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2);
+                ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3);
+                ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3);
+
+                int32x4_t ret = vpaddq_s32(ret0, ret1);
+
+                acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4),
+                        vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd)));
+                a_ptr++;
+                b_ptr++;
+            }
+            vst1q_f32(s, acc);
+            s += ncols_interleaved;
+        }
         return;
     }
-#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD)
     float sumf[4];
     int sumi;
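
Note: the hand-written AArch64 assembly kernel above is replaced by NEON intrinsics guarded by __ARM_FEATURE_DOTPROD. For readers unfamiliar with that extension, a minimal sketch of what vdotq_s32 computes, with va and vb as hypothetical int8x16_t inputs:

    // per 32-bit lane i (i = 0..3):
    //   acc[i] += va[4*i+0]*vb[4*i+0] + va[4*i+1]*vb[4*i+1]
    //           + va[4*i+2]*vb[4*i+2] + va[4*i+3]*vb[4*i+3]
    int32x4_t acc = vdupq_n_s32(0);
    acc = vdotq_s32(acc, va, vb);  // 16 signed 8-bit products, accumulated 4 per lane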
 

+ 1 - 1
llama/ggml-cpu-aarch64.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cpu-impl.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 6 - 2
llama/ggml-cpu-quants.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -129,10 +129,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
 }
 
 static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
-#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
+#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
     const __m256i zero = _mm256_setzero_si256();
     const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
     return _mm256_cvtepi32_ps(summed_pairs);
+#elif defined(__AVXVNNI__)
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
 #else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);

+ 1 - 1
llama/ggml-cpu-quants.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cpu-traits.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cpu-traits.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 7 - 7
llama/ggml-cpu.c

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -1012,7 +1012,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR  4
 
-static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -1023,7 +1023,7 @@ static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
     return _mm_loadu_ps(tmp);
 }
 
-static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     _mm_storeu_ps(arr, y);
@@ -7445,14 +7445,14 @@ static void ggml_compute_forward_mul_mat(
     if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
                                      (const char *)src1->data + i12*nb12 + i13*nb13,
                                      nb11/ggml_type_size(src1->type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
-                                     ith, nth,
                                      src0->type,
                                      src1->type,
                                      dst->type))
@@ -7497,14 +7497,14 @@ UseGgmlGemm1:;
 
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
-                if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                if (!llamafile_sgemm(params,
+                                     ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
                                      (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                      row_size/ggml_type_size(vec_dot_type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
-                                     ith, nth,
                                      src0->type,
                                      vec_dot_type,
                                      dst->type))
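
Note: both call sites now pass the ggml_compute_params pointer first and drop the explicit ith/nth arguments, which the sgemm code presumably reads from params instead. The prototype below is inferred from these call sites only (an assumption, not copied from sgemm.h):

    // inferred shape of the updated llamafile_sgemm entry point (assumption)
    bool llamafile_sgemm(const struct ggml_compute_params * params,
                         int64_t m, int64_t n, int64_t k,
                         const void * A, int64_t lda,
                         const void * B, int64_t ldb,
                         void * C, int64_t ldc,
                         int Atype, int Btype, int Ctype);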

+ 10 - 1
llama/ggml-cpu.cpp

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -419,8 +419,11 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
     switch (op->op) {
         case GGML_OP_CPY:
             return
+                op->type != GGML_TYPE_IQ3_XXS &&
+                op->type != GGML_TYPE_IQ3_S   &&
                 op->type != GGML_TYPE_IQ2_XXS &&
                 op->type != GGML_TYPE_IQ2_XS  &&
+                op->type != GGML_TYPE_IQ2_S   &&
                 op->type != GGML_TYPE_IQ1_S   &&
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
@@ -544,6 +547,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_sve()) {
             features.push_back({ "SVE", "1" });
         }
+        if (ggml_cpu_has_dotprod()) {
+            features.push_back({ "DOTPROD", "1" });
+        }
+        if (ggml_cpu_has_matmul_int8()) {
+            features.push_back({ "MATMUL_INT8", "1" });
+        }
         if (ggml_cpu_get_sve_cnt() > 0) {
             static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
             features.push_back({ "SVE_CNT", sve_cnt.c_str() });

+ 1 - 1
llama/ggml-cpu.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda.h

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/acc.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/acc.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/arange.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/arange.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argmax.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argmax.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argsort.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/argsort.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/binbcast.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/binbcast.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/clamp.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/clamp.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/common.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/concat.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/concat.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/conv-transpose-1d.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/conv-transpose-1d.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 3 - 1
llama/ggml-cuda/convert.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -706,6 +706,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_row_iq3_s_cuda;
         case GGML_TYPE_F16:
             return convert_unary_cuda<half>;
+        case GGML_TYPE_BF16:
+            return convert_unary_cuda<nv_bfloat16>;
         default:
             return nullptr;
     }
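Note: the new GGML_TYPE_BF16 case lets the CUDA to-float table handle bfloat16 tensors by widening them with convert_unary_cuda<nv_bfloat16>. For reference, a scalar CPU sketch of that widening (bfloat16 is simply the upper 16 bits of an IEEE-754 float32, so the conversion is a shift plus a bit-cast); the function name is illustrative and not part of the vendored code:

    #include <cstdint>
    #include <cstring>

    // Widen a row of raw bf16 values to f32. Each bf16 value occupies the
    // top 16 bits of the corresponding float32 bit pattern.
    static void bf16_row_to_f32_ref(const uint16_t * x, float * y, int64_t k) {
        for (int64_t i = 0; i < k; ++i) {
            const uint32_t bits = static_cast<uint32_t>(x[i]) << 16;
            std::memcpy(&y[i], &bits, sizeof(float));
        }
    }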

+ 1 - 1
llama/ggml-cuda/convert.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/count-equal.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/count-equal.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cpy.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cpy.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cross-entropy-loss.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/cross-entropy-loss.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/dequantize.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/diagmask.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/diagmask.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-common.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f16.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f32.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-tile-f32.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-vec-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-vec-f32.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn-wmma-f16.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/fattn.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/getrows.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/getrows.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 3 - 2
llama/ggml-cuda/ggml-cuda.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -1758,7 +1758,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
-    bool use_mul_mat_vec   = src0->type == GGML_TYPE_F16
+    bool use_mul_mat_vec   = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
@@ -2904,6 +2904,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
                     case GGML_TYPE_IQ3_XXS:
                     case GGML_TYPE_IQ4_NL:
                     case GGML_TYPE_IQ4_XS:
+                    case GGML_TYPE_BF16:
 #ifdef GGML_USE_MUSA
                         if (a->type == GGML_TYPE_Q3_K) {
                             return false;
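Note: with this change the dedicated matrix-vector path also accepts BF16 weights, and supports_op advertises GGML_TYPE_BF16 for MUL_MAT. The gating condition is easy to misread in diff form; a small C++ sketch with simplified, assumed names (not the actual ggml_tensor fields):

    enum class dtype { f16, bf16, f32, other };

    // The custom mul_mat_vec kernel is used only for an F16/BF16 weight
    // matrix multiplied by a single F32 column, producing F32 output, and
    // only when the row length is even so elements can be read in pairs.
    static bool takes_mul_mat_vec_path(dtype w, dtype act, dtype out,
                                       long long ncols, long long n_act_cols) {
        const bool weight_ok = (w == dtype::f16 || w == dtype::bf16);
        return weight_ok
            && act == dtype::f32 && out == dtype::f32
            && ncols % 2 == 0 && n_act_cols == 1;
    }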

+ 1 - 1
llama/ggml-cuda/im2col.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/im2col.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mma.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmq.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmq.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 77 - 39
llama/ggml-cuda/mmv.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *
@@ -27,9 +27,9 @@
 #include "common.cuh"
 #include "mmv.cuh"
 
-template <typename type_acc, int block_size>
+template <typename T, typename type_acc, int block_size>
 static __global__ void mul_mat_vec(
-        const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
+        const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
         const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
     const int64_t row     = blockIdx.x;
     const int64_t channel = blockIdx.z;
@@ -39,7 +39,6 @@ static __global__ void mul_mat_vec(
     y   +=  channel               *stride_channel_y;
     dst +=  channel               *stride_channel_dst;
 
-    const half2  * x2 = (const half2  *) x;
     const float2 * y2 = (const float2 *) y;
 
     extern __shared__ char data_mmv[];
@@ -54,28 +53,44 @@ static __global__ void mul_mat_vec(
 
     float sumf;
 
-    if (std::is_same<type_acc, float>::value) {
-        sumf = 0.0f;
+    if constexpr (std::is_same<T, half>::value) {
+        const half2 * x2 = (const half2 *) x;
 
-        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
-            const float2 tmpx = __half22float2(x2[col2]);
-            const float2 tmpy = y2[col2];
-            sumf += tmpx.x * tmpy.x;
-            sumf += tmpx.y * tmpy.y;
-        }
-    } else {
+        if (std::is_same<type_acc, float>::value) {
+            sumf = 0.0f;
+
+            for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+                const float2 tmpx = __half22float2(x2[col2]);
+                const float2 tmpy = y2[col2];
+                sumf += tmpx.x * tmpy.x;
+                sumf += tmpx.y * tmpy.y;
+            }
+        } else {
 #ifdef FP16_AVAILABLE
-        half2 sumh2 = make_half2(0.0f, 0.0f);
+            half2 sumh2 = make_half2(0.0f, 0.0f);
 
-        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
-            const float2 tmp = y2[col2];
-            sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
-        }
+            for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+                const float2 tmp = y2[col2];
+                sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
+            }
 
-        sumf = __low2float(sumh2) + __high2float(sumh2);
+            sumf = __low2float(sumh2) + __high2float(sumh2);
 #else
-        NO_DEVICE_CODE;
+            NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
+        }
+    } else if constexpr (std::is_same<T, nv_bfloat16>::value) {
+        const int * x2 = (const int *) x;
+        sumf = 0.0f;
+
+        for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
+            const int    tmpx = x2[col2];
+            const float2 tmpy = y2[col2];
+            sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
+            sumf += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
+        }
+    } else {
+        static_assert(std::is_same<T, void>::value, "unsupported type");
     }
 
     sumf = warp_reduce_sum(sumf);
@@ -97,9 +112,9 @@ static __global__ void mul_mat_vec(
     dst[row] = sumf;
 }
 
-template <typename type_acc>
+template <typename T, typename type_acc>
 static void launch_mul_mat_vec_cuda(
-        const half * x, const float * y, float * dst,
+        const T * x, const float * y, float * dst,
         const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
         const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
         cudaStream_t stream) {
@@ -123,35 +138,35 @@ static void launch_mul_mat_vec_cuda(
     const dim3 block_dims(block_size_best, 1, 1);
     switch (block_size_best) {
         case   32: {
-            mul_mat_vec<type_acc,  32><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc,  32><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case   64: {
-            mul_mat_vec<type_acc,  64><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc,  64><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case   96: {
-            mul_mat_vec<type_acc,  96><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc,  96><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case  128: {
-            mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc, 128><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case  160: {
-            mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc, 160><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case  192: {
-            mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc, 192><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case  224: {
-            mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc, 224><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         case  256: {
-            mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
+            mul_mat_vec<T, type_acc, 256><<<block_nums, block_dims, smem, stream>>>
                 (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
         } break;
         default: {
@@ -160,25 +175,25 @@ static void launch_mul_mat_vec_cuda(
     }
 }
 
+template<typename T>
 static void mul_mat_vec_cuda(
-        const half * x, const float * y, float * dst,
+        const T * x, const float * y, float * dst,
         const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
         const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
         enum ggml_prec prec, cudaStream_t stream) {
     switch (prec) {
         case GGML_PREC_DEFAULT: {
-            launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+            launch_mul_mat_vec_cuda<T, half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
                 stride_channel_x, stride_channel_y, stride_channel_dst, stream);
         } break;
         case GGML_PREC_F32: {
-            launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
+            launch_mul_mat_vec_cuda<T, float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
                 stride_channel_x, stride_channel_y, stride_channel_dst, stream);
         } break;
     }
 }
 
 void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type  == GGML_TYPE_F32);
 
@@ -190,7 +205,6 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
 
-    const half  * src0_d = (const half  *) src0->data;
     const float * src1_d = (const float *) src1->data;
     float       *  dst_d = (float       *)  dst->data;
 
@@ -207,7 +221,20 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor *
     const int64_t channel_stride_y   = src1->nb[2] / ggml_type_size(src1->type);
     const int64_t channel_stride_dst =  dst->nb[2] / ggml_type_size( dst->type);
 
-    mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+    switch (src0->type) {
+        case GGML_TYPE_F16: {
+            const half * src0_d = (const half *) src0->data;
+            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
+                channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+        } break;
+        case GGML_TYPE_BF16: {
+            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
+            mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12,
+                channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
+        } break;
+        default:
+            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+    }
 }
 
 void ggml_cuda_op_mul_mat_vec(
@@ -216,7 +243,6 @@ void ggml_cuda_op_mul_mat_vec(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream) {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type  == GGML_TYPE_F32);
 
@@ -237,8 +263,20 @@ void ggml_cuda_op_mul_mat_vec(
     const int64_t channel_stride_y   = 0;
     const int64_t channel_stride_dst = 0;
 
-    mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
-        nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+    switch (src0->type) {
+        case GGML_TYPE_F16: {
+            const half * src0_d = (const half *) src0_dd_i;
+            mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+                nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+        } break;
+        case GGML_TYPE_BF16: {
+            const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
+            mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
+                nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
+        } break;
+        default:
+            GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
+    }
 
     GGML_UNUSED(ctx);
     GGML_UNUSED(src1);
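Note: the kernel is now templated on the element type T (half or nv_bfloat16), and the host entry points dispatch on src0->type instead of asserting F16. In the BF16 branch, the kernel loads two bf16 weights at a time through a 32-bit int and widens each half before multiplying with the matching float activations. A host-side C++ sketch of that packed-pair step (assuming little-endian layout, matching the device code's reinterpret_cast indexing); it illustrates the arithmetic only and is not the CUDA kernel:

    #include <cstdint>
    #include <cstring>

    // Widen one raw bf16 value (upper 16 bits of a float32) to float.
    static float bf16_to_f32(uint16_t v) {
        const uint32_t bits = static_cast<uint32_t>(v) << 16;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    // One inner-loop step of the BF16 branch: two packed bf16 weights
    // multiplied with two float activations and accumulated.
    static float bf16_pair_dot(uint32_t packed_x, float y0, float y1) {
        const uint16_t x0 = static_cast<uint16_t>(packed_x & 0xFFFFu);  // element [0]
        const uint16_t x1 = static_cast<uint16_t>(packed_x >> 16);      // element [1]
        return bf16_to_f32(x0) * y0 + bf16_to_f32(x1) * y1;
    }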

+ 1 - 1
llama/ggml-cuda/mmv.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmvq.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/mmvq.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/norm.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/norm.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/opt-step-adamw.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/opt-step-adamw.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/out-prod.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/out-prod.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pad.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pad.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pool2d.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/pool2d.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/quantize.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/quantize.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/rope.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/rope.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/scale.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/scale.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/softmax.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/softmax.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/sum.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/sum.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/sumrows.cu

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

+ 1 - 1
llama/ggml-cuda/sumrows.cuh

@@ -1,5 +1,5 @@
 /**
- * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  *
  * MIT License
  *

Some files were not shown because of the large number of changes