
Update llama.cpp submodule commit to `d94c6e0c` (#5805)

Jeffrey Morgan, 9 months ago
parent
commit f8fedbda20

+ 1 - 1
llm/llama.cpp

@@ -1 +1 @@
-Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584
+Subproject commit d94c6e0ccbd29ee1ba4f44e9caa8682ad94df9fa

+ 5 - 5
llm/patches/05-default-pretokenizer.diff

@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 2b9ace28..172640e2 100644
+index 8fe51971..7113ba64 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
+@@ -5433,16 +5433,7 @@ static void llm_load_vocab(
          if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
              vocab.tokenizer_add_space_prefix = false;
              vocab.tokenizer_clean_spaces = true;
@@ -20,9 +20,9 @@ index 2b9ace28..172640e2 100644
                  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
              } else if (
                      tokenizer_pre == "llama3"   ||
-@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
-                 tokenizer_pre == "jais") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
+@@ -5526,7 +5517,8 @@ static void llm_load_vocab(
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                 vocab.tokenizer_clean_spaces = false;
              } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
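
The rebased 05-default-pretokenizer.diff keeps Ollama's behavior of tolerating unknown `tokenizer.ggml.pre` values: rather than throwing, llama.cpp logs a warning and falls back to the default BPE pre-tokenizer. A minimal standalone C++ sketch of that fallback pattern, using stand-in type and function names rather than the real `llm_load_vocab` internals:

```cpp
#include <cstdio>
#include <string>

// Stand-ins for the real llama.cpp vocab types; names here are illustrative only.
enum pre_type { PRE_TYPE_DEFAULT = 0, PRE_TYPE_LLAMA3 = 1, PRE_TYPE_SMOLLM = 2 };

static pre_type resolve_pre_tokenizer(const std::string & tokenizer_pre) {
    if (tokenizer_pre == "llama3") {
        return PRE_TYPE_LLAMA3;
    } else if (tokenizer_pre == "smollm") {
        return PRE_TYPE_SMOLLM;
    } else {
        // What the patch does: warn and fall back, instead of
        // throw std::runtime_error("unknown pre-tokenizer type: ...");
        fprintf(stderr, "%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
        return PRE_TYPE_DEFAULT;
    }
}

int main() {
    printf("%d\n", resolve_pre_tokenizer("llama3"));      // 1
    printf("%d\n", resolve_pre_tokenizer("not-a-type"));  // warns, then prints 0
}
```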

+ 0 - 0
llm/patches/07-embeddings.diff → llm/patches/06-embeddings.diff


+ 0 - 13
llm/patches/06-qwen2.diff

@@ -1,13 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 40d2ec2c..f34eb79a 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
-         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
-         cb(kq, "kq", il);
- 
--        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
-+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
-             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
-             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
-             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);

+ 0 - 0
llm/patches/08-clip-unicode.diff → llm/patches/07-clip-unicode.diff


+ 0 - 0
llm/patches/09-pooling.diff → llm/patches/08-pooling.diff


+ 360 - 0
llm/patches/09-lora.diff

@@ -0,0 +1,360 @@
+diff --git a/common/common.cpp b/common/common.cpp
+index dbb724fb..c26fe6ee 100644
+--- a/common/common.cpp
++++ b/common/common.cpp
+@@ -2087,14 +2087,29 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
+     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
+         float lora_scale = std::get<1>(params.lora_adapter[i]);
++
++        // try to load as gguf
+         auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
+         if (adapter == nullptr) {
+-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+-            llama_free(lctx);
+-            llama_free_model(model);
+-            return std::make_tuple(nullptr, nullptr);
++            fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
++
++            // if that fails, try loading as ggla for compatibility
++            int err = llama_model_apply_lora_from_file(model,
++                                                    lora_adapter.c_str(),
++                                                    lora_scale,
++                                                    ((i > 0) || params.lora_base.empty())
++                                                        ? NULL
++                                                        : params.lora_base.c_str(),
++                                                    params.n_threads);
++            if (err != 0) {
++                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
++                llama_free(lctx);
++                llama_free_model(model);
++                return std::make_tuple(nullptr, nullptr);
++            }
++        } else {
++            llama_lora_adapter_set(lctx, adapter, lora_scale);
+         }
+-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+     }
+ 
+     if (params.ignore_eos) {
+diff --git a/include/llama.h b/include/llama.h
+index 93fd77ca..b0fb37a6 100644
+--- a/include/llama.h
++++ b/include/llama.h
+@@ -1160,6 +1160,20 @@ extern "C" {
+ 
+     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+ 
++    // Apply a LoRA adapter to a loaded model
++    // path_base_model is the path to a higher quality model to use as a base for
++    // the layers modified by the adapter. Can be NULL to use the current loaded model.
++    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
++    // will be applied on top of the previous one
++    // Returns 0 on success
++    LLAMA_API int32_t llama_model_apply_lora_from_file(
++            const struct llama_model * model,
++                            const char * path_lora,
++                                float   scale,
++                            const char * path_base_model,
++                                int32_t   n_threads);
++
++
+ #ifdef __cplusplus
+ }
+ #endif
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 80a0dd0f..9d7b0e17 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
+     fputs(text, stderr);
+     fflush(stderr);
+ }
++
++static int llama_apply_lora_from_file_internal(
++    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
++) {
++    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
++
++    const int64_t t_start_lora_us = ggml_time_us();
++
++    llama_file fin(path_lora, "rb");
++
++    // verify magic and version
++    {
++        uint32_t magic = fin.read_u32();
++        if (magic != LLAMA_FILE_MAGIC_GGLA) {
++            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
++            return 1;
++        }
++
++        uint32_t format_version = fin.read_u32();
++        if (format_version != 1) {
++            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
++            return 1;
++        }
++    }
++
++    int32_t lora_r = fin.read_u32();
++    int32_t lora_alpha = fin.read_u32();
++    float scaling = scale * (float)lora_alpha / (float)lora_r;
++
++    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
++
++    // load base model
++    std::unique_ptr<llama_model_loader> ml;
++    if (path_base_model) {
++        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
++        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
++        ml->init_mappings(/*prefetch*/ false); // no prefetching
++    }
++
++    struct tensor_meta {
++        std::string name;
++        ggml_type type;
++        int32_t ne[2];
++        size_t offset;
++    };
++    std::map<std::string, tensor_meta> tensor_meta_map;
++
++    // load all tensor meta
++    while (true) {
++        if (fin.tell() == fin.size) {
++            // eof
++            break;
++        }
++
++        int32_t n_dims;
++        int32_t name_len;
++        int32_t ftype;
++
++        fin.read_raw(&n_dims, sizeof(n_dims));
++        fin.read_raw(&name_len, sizeof(name_len));
++        fin.read_raw(&ftype, sizeof(ftype));
++
++        if (n_dims != 1 && n_dims != 2) {
++            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
++            return 1;
++        }
++
++        int32_t ne[2] = { 1, 1 };
++        for (int i = 0; i < n_dims; ++i) {
++            fin.read_raw(&ne[i], sizeof(ne[i]));
++        }
++
++        std::string name;
++        {
++            GGML_ASSERT(name_len < GGML_MAX_NAME);
++            char buf[GGML_MAX_NAME];
++            fin.read_raw(buf, name_len);
++            name = std::string(buf, name_len);
++        }
++
++        // check for lora suffix
++        std::string lora_suffix;
++        if (name.length() > 6) {
++            lora_suffix = name.substr(name.length() - 6);
++        }
++        if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
++            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
++            return 1;
++        }
++
++        // tensor type
++        ggml_type wtype;
++        switch (ftype) {
++            case 0: wtype = GGML_TYPE_F32;  break;
++            case 1: wtype = GGML_TYPE_F16;  break;
++            default:
++                    {
++                        LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
++                                __func__, ftype);
++                        return 1;
++                    }
++        }
++
++        // data offset
++        size_t offset = fin.tell();
++        offset = (offset + 31) & -32;
++
++        // skip tensor data
++        fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
++
++        tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
++    }
++
++    bool warned = false;
++    int n_tensors = 0;
++
++    // apply
++    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
++    if (backend_cpu == nullptr) {
++        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
++        return 1;
++    }
++    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
++
++    std::vector<no_init<uint8_t>> read_buf;
++    for (const auto & it : model.tensors_by_name) {
++        const std::string & base_name = it.first;
++        ggml_tensor * model_t = it.second;
++
++        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
++            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
++            continue;
++        }
++
++        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
++        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
++
++        ggml_init_params lora_init_params = {
++            /* .mem_size   */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
++            /* .mem_buffer */ nullptr,
++            /* .no_alloc   */ true,
++        };
++        ggml_context * lora_ctx = ggml_init(lora_init_params);
++        if (lora_ctx == nullptr) {
++            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
++            ggml_backend_free(backend_cpu);
++            return 1;
++        }
++
++        // create tensors
++        ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
++        ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
++        ggml_set_name(loraA, metaA.name.c_str());
++        ggml_set_name(loraB, metaB.name.c_str());
++
++        ggml_tensor * base_t;
++        if (ml) {
++            if (!ml->get_tensor_meta(base_name.c_str())) {
++                LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
++                return 1;
++            }
++            base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
++        } else {
++            base_t = ggml_dup_tensor(lora_ctx, model_t);
++        }
++        ggml_set_name(base_t, base_name.c_str());
++
++        // allocate in backend buffer
++        ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
++        if (lora_buf == nullptr) {
++            LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
++            return 1;
++        }
++
++        // load tensor data
++        auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
++            read_buf.resize(ggml_nbytes(tensor));
++            fin.seek(tensor_meta.offset, SEEK_SET);
++            fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
++            ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
++        };
++        load_tensor(metaA, loraA);
++        load_tensor(metaB, loraB);
++
++        // load base model tensor data
++        if (ml) {
++            ml->load_data_for(base_t);
++        } else {
++            ggml_backend_tensor_copy(model_t, base_t);
++        }
++
++        if (ggml_is_quantized(base_t->type) && !warned) {
++            LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
++                            "use a f16 or f32 base model with --lora-base\n", __func__);
++            warned = true;
++        }
++
++        if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
++            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
++                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
++            ggml_free(lora_ctx);
++            ggml_backend_buffer_free(lora_buf);
++            ggml_backend_free(backend_cpu);
++            return 1;
++        }
++
++        auto build_lora_graph = [&]() {
++            // w = w + BA*s
++            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
++            ggml_set_name(BA, "BA");
++
++            if (scaling != 1.0f) {
++                BA = ggml_scale(lora_ctx, BA, scaling);
++                ggml_set_name(BA, "BA_scaled");
++            }
++
++            ggml_tensor * r;
++            r = ggml_add_inplace(lora_ctx, base_t, BA);
++            ggml_set_name(r, "r_add");
++
++            if (base_t->type != model_t->type) {
++                // convert the result to the model type
++                r = ggml_cast(lora_ctx, r, model_t->type);
++                ggml_set_name(r, "r_cast");
++            }
++
++            return r;
++        };
++
++        ggml_cgraph * gf = ggml_new_graph(lora_ctx);
++        ggml_tensor * r = build_lora_graph();
++        ggml_build_forward_expand(gf, r);
++
++        ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
++        if (graph_buf == nullptr) {
++            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
++            ggml_free(lora_ctx);
++            ggml_backend_buffer_free(lora_buf);
++            ggml_backend_free(backend_cpu);
++            return 1;
++        }
++
++        ggml_backend_graph_compute(backend_cpu, gf);
++
++        ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
++
++#if 0
++        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
++        //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
++
++        // sched compute
++        ggml_build_forward_expand(gf, build_graph());
++        ggml_backend_sched_init_measure(sched, gf);
++
++        // create the graph again, since the previous one was destroyed by the measure
++        ggml_graph_clear(gf);
++        ggml_build_forward_expand(gf, build_graph());
++        ggml_backend_sched_graph_compute(sched, gf);
++        ggml_backend_sched_free(sched);
++#endif
++
++        ggml_backend_buffer_free(lora_buf);
++        ggml_backend_buffer_free(graph_buf);
++        ggml_free(lora_ctx);
++
++        n_tensors++;
++        if (n_tensors % 4 == 0) {
++            LLAMA_LOG_INFO(".");
++        }
++    }
++
++    ggml_backend_free(backend_cpu);
++
++    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
++    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
++
++    return 0;
++}
++
++int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
++    try {
++        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
++    } catch (const std::exception & err) {
++        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
++        return 1;
++    }
++}
+\ No newline at end of file
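
The new 09-lora.diff restores the legacy ggla code path that upstream llama.cpp removed: `llama_init_from_gpt_params` first tries the GGUF adapter loader (`llama_lora_adapter_init`) and, if that fails, falls back to the reinstated `llama_model_apply_lora_from_file`, which verifies the ggla magic and version, aligns each tensor's data offset up to a 32-byte boundary, and merges the adapter as w ← w + s·(B·A) with s = scale·alpha/r (the `// w = w + BA*s` graph above). Below is a minimal standalone C++ sketch of that merge using plain loops and made-up dimensions, not the ggml-based implementation in the patch:

```cpp
#include <cstdio>
#include <vector>

// Illustrative only: dense LoRA merge w += s * (B · A), where A is r x n_in,
// B is n_out x r, and s = scale * alpha / r (the same scaling the patch computes).
static void apply_lora(std::vector<float> & w,        // n_out x n_in, row-major
                       const std::vector<float> & A,  // r x n_in
                       const std::vector<float> & B,  // n_out x r
                       int n_out, int n_in, int r,
                       float alpha, float scale) {
    const float s = scale * alpha / (float) r;
    for (int i = 0; i < n_out; ++i) {
        for (int j = 0; j < n_in; ++j) {
            float delta = 0.0f;
            for (int k = 0; k < r; ++k) {
                delta += B[i * r + k] * A[k * n_in + j];
            }
            w[i * n_in + j] += s * delta;
        }
    }
}

int main() {
    // Tiny example: 2x3 weight, rank-1 adapter, alpha = 2, scale = 1 -> s = 2.
    std::vector<float> w(2 * 3, 0.0f);
    std::vector<float> A = {1, 0, 1};      // 1 x 3
    std::vector<float> B = {1, 2};         // 2 x 1
    apply_lora(w, A, B, 2, 3, 1, 2.0f, 1.0f);
    for (float v : w) printf("%.1f ", v);  // 2.0 0.0 2.0 4.0 0.0 4.0
    printf("\n");
}
```

The same formula explains the scaling log line in the patch: for example, r = 16 and alpha = 32 at scale 1.0 give s = 2.0, so an adapter trained with alpha = 2r effectively doubles the raw B·A update.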

+ 0 - 43
llm/patches/10-tekken.diff

@@ -1,43 +0,0 @@
-diff --git a/include/llama.h b/include/llama.h
-index bb4b05ba..a92174e0 100644
---- a/include/llama.h
-+++ b/include/llama.h
-@@ -92,6 +92,7 @@ extern "C" {
-         LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
-         LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
-         LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
-+        LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
-     };
- 
-     // note: these values should be synchronized with ggml_rope
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 18364976..435b6fe5 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -5429,6 +5429,12 @@ static void llm_load_vocab(
-             } else if (
-                 tokenizer_pre == "jais") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
-+            } else if (
-+                tokenizer_pre == "tekken") {
-+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
-+                vocab.tokenizer_clean_spaces = false;
-+                vocab.tokenizer_ignore_merges = true;
-+                vocab.tokenizer_add_bos = true;
-             } else {
-                 LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe {
-                     " ?[^(\\s|.,!?…。,、।۔،)]+",
-                 };
-                 break;
-+            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
-+                    // original regex from tokenizer.json
-+                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-+                regex_exprs = {
-+                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-+                };
-+                break;
-             default:
-                 // default regex for BPE tokenization pre-processing
-                 regex_exprs = {

+ 0 - 19
llm/patches/11-embd_kv.diff

@@ -1,19 +0,0 @@
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 2b9ace28..e60d3d8d 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -6052,10 +6052,10 @@ static bool llm_load_tensors(
- 
-                         layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- 
--                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
--                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
--                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
--                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd,  n_embd_head_k * n_head});
-+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa});
-+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa});
-+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
- 
-                         // optional bias tensors
-                         layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
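
The dropped 11-embd_kv.diff sized the attention projections from the head dimensions (`n_embd_head_k * n_head`, `n_embd_k_gqa`, `n_embd_v_gqa`) instead of assuming square `n_embd x n_embd` weights; removing it alongside the submodule bump suggests the updated llama.cpp already creates these tensors that way. For intuition, a small sketch with hypothetical grouped-query-attention numbers (not taken from any particular model), assuming the key and value head sizes are equal:

```cpp
#include <cstdio>

int main() {
    // Hypothetical GQA configuration, for illustration only.
    const int n_head        = 32;   // query heads
    const int n_head_kv     = 8;    // key/value heads (grouped-query attention)
    const int n_embd_head_k = 128;  // per-head dimension
    const int n_embd        = n_head * n_embd_head_k;    // 4096

    const int n_embd_k_gqa  = n_head_kv * n_embd_head_k; // 1024
    const int n_embd_v_gqa  = n_head_kv * n_embd_head_k; // 1024

    // The shapes the removed patch used when creating the projection weights:
    printf("wq: %d x %d\n", n_embd, n_embd_head_k * n_head); // 4096 x 4096
    printf("wk: %d x %d\n", n_embd, n_embd_k_gqa);           // 4096 x 1024
    printf("wv: %d x %d\n", n_embd, n_embd_v_gqa);           // 4096 x 1024
    printf("wo: %d x %d\n", n_embd_head_k * n_head, n_embd); // 4096 x 4096
    return 0;
}
```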