View source

llm: update llama.cpp commit to 8962422 (#6618)

Jeffrey Morgan 8 months ago
parent
commit
5e2653f9fe

+ 12 - 7
llm/ext_server/server.cpp

@@ -425,7 +425,7 @@ struct llama_server_context
 
         n_ctx = llama_n_ctx(ctx);
 
-        add_bos_token = llama_should_add_bos_token(model);
+        add_bos_token = llama_add_bos_token(model);
 
         return true;
     }
@@ -1031,7 +1031,7 @@ struct llama_server_context
                 continue;
             }
 
-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                 LOG_TEE("Error processing the given image");
                 return false;
             }
@@ -2014,7 +2014,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("options:\n");
     printf("  -h, --help                show this help message and exit\n");
     printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.cpuparams.n_threads);
     printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
     printf("  --threads-http N          number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
     printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
@@ -2287,7 +2287,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            params.n_threads = std::stoi(argv[i]);
+            params.cpuparams.n_threads = std::stoi(argv[i]);
         }
         else if (arg == "--grp-attn-n" || arg == "-gan")
         {
@@ -2315,7 +2315,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
                 invalid_param = true;
                 break;
             }
-            params.n_threads_batch = std::stoi(argv[i]);
+            params.cpuparams_batch.n_threads = std::stoi(argv[i]);
         }
         else if (arg == "--threads-http")
         {
@@ -2626,6 +2626,11 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, g
         params.kv_overrides.back().key[0] = 0;
     }
 
+    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
     if (invalid_param)
     {
         fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
@@ -2775,8 +2780,8 @@ int main(int argc, char **argv) {
                             {"commit", LLAMA_COMMIT}});
 
     LOG_INFO("system info", {
-                                {"n_threads", params.n_threads},
-                                {"n_threads_batch", params.n_threads_batch},
+                                {"n_threads", params.cpuparams.n_threads},
+                                {"n_threads_batch", params.cpuparams_batch.n_threads},
                                 {"total_threads", std::thread::hardware_concurrency()},
                                 {"system_info", llama_print_system_info()},
                             });
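The server.cpp changes above track a llama.cpp refactor that moves the per-role thread counts out of flat gpt_params fields (n_threads, n_threads_batch) and into nested cpu_params structs, which are then normalized by postprocess_cpu_params(). A minimal C++ sketch of the resulting access pattern; only cpuparams, cpuparams_batch and n_threads come from this diff, and the surrounding struct shape is a simplified assumption:

    #include <cstdio>

    // Assumed, simplified layout: the upstream cpu_params struct also carries
    // affinity/priority fields that are irrelevant to this diff.
    struct cpu_params { int n_threads = -1; };
    struct gpt_params {
        cpu_params cpuparams;        // generation threads (-t / --threads)
        cpu_params cpuparams_batch;  // prompt/batch threads (-tb / --threads-batch)
    };

    static void log_thread_config(const gpt_params & params) {
        // Reads the same fields the LOG_INFO("system info", ...) call above now uses.
        std::printf("n_threads=%d n_threads_batch=%d\n",
                    params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
    }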

+ 1 - 1
llm/generate/gen_darwin.sh

@@ -19,7 +19,7 @@ sign() {
     fi
 }
 
-COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
+COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
 
 case "${GOARCH}" in
 "amd64")

+ 1 - 1
llm/llama.cpp

@@ -1 +1 @@
-Subproject commit 1e6f6554aa11fa10160a5fda689e736c3c34169f
+Subproject commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177

+ 5 - 5
llm/patches/05-default-pretokenizer.diff

@@ -1,8 +1,8 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..2ddf431d 100644
+index 88355971..dd7d41ed 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5347,16 +5347,7 @@ static void llm_load_vocab(
+@@ -6083,16 +6083,7 @@ static void llm_load_vocab(
          if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
              vocab.tokenizer_add_space_prefix = false;
              vocab.tokenizer_clean_spaces = true;
@@ -20,9 +20,9 @@ index a207451f..2ddf431d 100644
                  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
              } else if (
                      tokenizer_pre == "llama3"   ||
-@@ -5443,7 +5434,8 @@ static void llm_load_vocab(
-                 tokenizer_pre == "codeshell") {
-                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+@@ -6188,7 +6179,8 @@ static void llm_load_vocab(
+                 tokenizer_pre == "exaone") {
+                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
              } else {
 -                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
 +                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
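Rebased onto the new submodule commit, this patch keeps its original intent: an unrecognized tokenizer.ggml.pre value logs a warning and model loading continues with the default BPE pre-tokenizer instead of throwing. An illustrative, self-contained stand-in for that selection logic (the PRE_TYPE_ names and the helper below are invented for the example, not llama.cpp's own symbols):

    #include <cstdio>
    #include <string>

    enum pre_type { PRE_TYPE_DEFAULT, PRE_TYPE_LLAMA3, PRE_TYPE_EXAONE };

    // Stand-in for the patched llm_load_vocab() branch: known names map to
    // their pre-tokenizer, anything else warns and falls back to the default.
    static pre_type select_pre_type(const std::string & tokenizer_pre) {
        if (tokenizer_pre == "llama3") return PRE_TYPE_LLAMA3;
        if (tokenizer_pre == "exaone") return PRE_TYPE_EXAONE;
        std::fprintf(stderr, "missing or unrecognized pre-tokenizer type, using: 'default'\n");
        return PRE_TYPE_DEFAULT;
    }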

+ 13 - 15
llm/patches/06-embeddings.diff

@@ -1,37 +1,36 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 1fe2b9f7..a43312a7 100644
+index 88355971..d7db689b 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -13689,7 +13689,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
+@@ -15906,7 +15906,7 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
      const auto n_embd  = hparams.n_embd;
  
      // TODO: use a per-batch flag for logits presence instead
 -    const bool has_logits = !cparams.embeddings;
 +    const bool has_logits =  cparams.causal_attn;
-     const bool has_embd   =  lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE));
+     const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
  
      const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
-@@ -13959,17 +13959,25 @@ static int llama_decode_internal(
+@@ -16175,20 +16175,23 @@ static int llama_decode_internal(
              // no output
              res  = nullptr;
              embd = nullptr;
 -        } else if (cparams.embeddings) {
--            res = nullptr; // do not extract logits for embedding case
--            embd = gf->nodes[gf->n_nodes - 1];
--            if (strcmp(embd->name, "result_embd_pooled") != 0) {
--                embd = gf->nodes[gf->n_nodes - 2];
+-            res  = nullptr; // do not extract logits for embedding case
+-            embd = nullptr;
 +        }
 +
 +        if (cparams.embeddings) {
-+            for (int i = gf->n_nodes - 1; i >= 0; --i) {
+             for (int i = gf->n_nodes - 1; i >= 0; --i) {
+-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
+-                    embd = gf->nodes[i];
 +                embd = gf->nodes[i];
 +                if (strcmp(embd->name, "result_embd_pooled") == 0) {
-+                    break;
-+                }
+                     break;
+                 }
              }
-             GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
--        } else {
-+         } else {
+-            GGML_ASSERT(embd != nullptr && "missing embeddings tensor");
+         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
@@ -39,7 +38,6 @@ index 1fe2b9f7..a43312a7 100644
 +        if (!cparams.causal_attn) {
 +            res = nullptr; // do not extract logits when not needed
 +        }
-+
         // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
  
         ggml_backend_sched_alloc_graph(lctx.sched, gf);
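The updated embeddings patch no longer assumes the pooled-embeddings tensor is one of the last two graph nodes; it scans the graph backwards for a node named "result_embd_pooled". A self-contained sketch of that search with stand-in graph types (only the backwards name scan is taken from the hunk; the real code walks ggml_cgraph/ggml_tensor rather than the toy structs below):

    #include <cstring>
    #include <vector>

    // Toy stand-ins for ggml's graph types, just enough to show the search.
    struct toy_tensor { const char * name; };
    struct toy_cgraph { std::vector<toy_tensor *> nodes; };

    // Walk the graph from the last node backwards and stop at the pooled
    // embeddings output, mirroring the patched llama_decode_internal().
    static toy_tensor * find_pooled_embd(const toy_cgraph & gf) {
        for (int i = (int) gf.nodes.size() - 1; i >= 0; --i) {
            toy_tensor * embd = gf.nodes[i];
            if (std::strcmp(embd->name, "result_embd_pooled") == 0) {
                return embd;
            }
        }
        return nullptr; // caller decides how to treat a missing tensor
    }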

+ 0 - 350
llm/patches/09-lora.diff

@@ -1,350 +0,0 @@
-diff --git a/common/common.cpp b/common/common.cpp
-index 2e8374d5..70d0afde 100644
---- a/common/common.cpp
-+++ b/common/common.cpp
-@@ -2110,9 +2110,21 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
-         loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-         if (loaded_la.adapter == nullptr) {
-             fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
--            llama_free(lctx);
--            llama_free_model(model);
--            return iparams;
-+
-+            // if that fails, try loading as ggla for compatibility
-+            int err = llama_model_apply_lora_from_file(model,
-+                                                    la.path.c_str(),
-+                                                    la.scale,
-+                                                    nullptr,
-+                                                    params.n_threads);
-+            if (err != 0) {
-+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-+                llama_free(lctx);
-+                llama_free_model(model);
-+                return iparams;
-+            } else {
-+                break;
-+            }
-         }
-         iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
-     }
-diff --git a/include/llama.h b/include/llama.h
-index 93fd77ca..b0fb37a6 100644
---- a/include/llama.h
-+++ b/include/llama.h
-@@ -1160,6 +1160,20 @@ extern "C" {
- 
-     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
- 
-+    // Apply a LoRA adapter to a loaded model
-+    // path_base_model is the path to a higher quality model to use as a base for
-+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-+    // will be applied on top of the previous one
-+    // Returns 0 on success
-+    LLAMA_API int32_t llama_model_apply_lora_from_file(
-+            const struct llama_model * model,
-+                            const char * path_lora,
-+                                float   scale,
-+                            const char * path_base_model,
-+                                int32_t   n_threads);
-+
-+
- #ifdef __cplusplus
- }
- #endif
-diff --git a/src/llama.cpp b/src/llama.cpp
-index 80a0dd0f..9d7b0e17 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
-     fputs(text, stderr);
-     fflush(stderr);
- }
-+
-+static int llama_apply_lora_from_file_internal(
-+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
-+) {
-+    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
-+
-+    const int64_t t_start_lora_us = ggml_time_us();
-+
-+    llama_file fin(path_lora, "rb");
-+
-+    // verify magic and version
-+    {
-+        uint32_t magic = fin.read_u32();
-+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
-+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
-+            return 1;
-+        }
-+
-+        uint32_t format_version = fin.read_u32();
-+        if (format_version != 1) {
-+            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
-+            return 1;
-+        }
-+    }
-+
-+    int32_t lora_r = fin.read_u32();
-+    int32_t lora_alpha = fin.read_u32();
-+    float scaling = scale * (float)lora_alpha / (float)lora_r;
-+
-+    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
-+
-+    // load base model
-+    std::unique_ptr<llama_model_loader> ml;
-+    if (path_base_model) {
-+        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
-+        ml->init_mappings(/*prefetch*/ false); // no prefetching
-+    }
-+
-+    struct tensor_meta {
-+        std::string name;
-+        ggml_type type;
-+        int32_t ne[2];
-+        size_t offset;
-+    };
-+    std::map<std::string, tensor_meta> tensor_meta_map;
-+
-+    // load all tensor meta
-+    while (true) {
-+        if (fin.tell() == fin.size) {
-+            // eof
-+            break;
-+        }
-+
-+        int32_t n_dims;
-+        int32_t name_len;
-+        int32_t ftype;
-+
-+        fin.read_raw(&n_dims, sizeof(n_dims));
-+        fin.read_raw(&name_len, sizeof(name_len));
-+        fin.read_raw(&ftype, sizeof(ftype));
-+
-+        if (n_dims != 1 && n_dims != 2) {
-+            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
-+            return 1;
-+        }
-+
-+        int32_t ne[2] = { 1, 1 };
-+        for (int i = 0; i < n_dims; ++i) {
-+            fin.read_raw(&ne[i], sizeof(ne[i]));
-+        }
-+
-+        std::string name;
-+        {
-+            GGML_ASSERT(name_len < GGML_MAX_NAME);
-+            char buf[GGML_MAX_NAME];
-+            fin.read_raw(buf, name_len);
-+            name = std::string(buf, name_len);
-+        }
-+
-+        // check for lora suffix
-+        std::string lora_suffix;
-+        if (name.length() > 6) {
-+            lora_suffix = name.substr(name.length() - 6);
-+        }
-+        if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
-+            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
-+            return 1;
-+        }
-+
-+        // tensor type
-+        ggml_type wtype;
-+        switch (ftype) {
-+            case 0: wtype = GGML_TYPE_F32;  break;
-+            case 1: wtype = GGML_TYPE_F16;  break;
-+            default:
-+                    {
-+                        LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
-+                                __func__, ftype);
-+                        return 1;
-+                    }
-+        }
-+
-+        // data offset
-+        size_t offset = fin.tell();
-+        offset = (offset + 31) & -32;
-+
-+        // skip tensor data
-+        fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
-+
-+        tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
-+    }
-+
-+    bool warned = false;
-+    int n_tensors = 0;
-+
-+    // apply
-+    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
-+    if (backend_cpu == nullptr) {
-+        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
-+        return 1;
-+    }
-+    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
-+
-+    std::vector<no_init<uint8_t>> read_buf;
-+    for (const auto & it : model.tensors_by_name) {
-+        const std::string & base_name = it.first;
-+        ggml_tensor * model_t = it.second;
-+
-+        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
-+            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
-+            continue;
-+        }
-+
-+        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
-+        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
-+
-+        ggml_init_params lora_init_params = {
-+            /* .mem_size   */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
-+            /* .mem_buffer */ nullptr,
-+            /* .no_alloc   */ true,
-+        };
-+        ggml_context * lora_ctx = ggml_init(lora_init_params);
-+        if (lora_ctx == nullptr) {
-+            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
-+            ggml_backend_free(backend_cpu);
-+            return 1;
-+        }
-+
-+        // create tensors
-+        ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
-+        ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
-+        ggml_set_name(loraA, metaA.name.c_str());
-+        ggml_set_name(loraB, metaB.name.c_str());
-+
-+        ggml_tensor * base_t;
-+        if (ml) {
-+            if (!ml->get_tensor_meta(base_name.c_str())) {
-+                LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
-+                return 1;
-+            }
-+            base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
-+        } else {
-+            base_t = ggml_dup_tensor(lora_ctx, model_t);
-+        }
-+        ggml_set_name(base_t, base_name.c_str());
-+
-+        // allocate in backend buffer
-+        ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
-+        if (lora_buf == nullptr) {
-+            LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
-+            return 1;
-+        }
-+
-+        // load tensor data
-+        auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
-+            read_buf.resize(ggml_nbytes(tensor));
-+            fin.seek(tensor_meta.offset, SEEK_SET);
-+            fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
-+            ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
-+        };
-+        load_tensor(metaA, loraA);
-+        load_tensor(metaB, loraB);
-+
-+        // load base model tensor data
-+        if (ml) {
-+            ml->load_data_for(base_t);
-+        } else {
-+            ggml_backend_tensor_copy(model_t, base_t);
-+        }
-+
-+        if (ggml_is_quantized(base_t->type) && !warned) {
-+            LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
-+                            "use a f16 or f32 base model with --lora-base\n", __func__);
-+            warned = true;
-+        }
-+
-+        if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-+            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-+                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
-+            ggml_free(lora_ctx);
-+            ggml_backend_buffer_free(lora_buf);
-+            ggml_backend_free(backend_cpu);
-+            return 1;
-+        }
-+
-+        auto build_lora_graph = [&]() {
-+            // w = w + BA*s
-+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
-+            ggml_set_name(BA, "BA");
-+
-+            if (scaling != 1.0f) {
-+                BA = ggml_scale(lora_ctx, BA, scaling);
-+                ggml_set_name(BA, "BA_scaled");
-+            }
-+
-+            ggml_tensor * r;
-+            r = ggml_add_inplace(lora_ctx, base_t, BA);
-+            ggml_set_name(r, "r_add");
-+
-+            if (base_t->type != model_t->type) {
-+                // convert the result to the model type
-+                r = ggml_cast(lora_ctx, r, model_t->type);
-+                ggml_set_name(r, "r_cast");
-+            }
-+
-+            return r;
-+        };
-+
-+        ggml_cgraph * gf = ggml_new_graph(lora_ctx);
-+        ggml_tensor * r = build_lora_graph();
-+        ggml_build_forward_expand(gf, r);
-+
-+        ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
-+        if (graph_buf == nullptr) {
-+            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
-+            ggml_free(lora_ctx);
-+            ggml_backend_buffer_free(lora_buf);
-+            ggml_backend_free(backend_cpu);
-+            return 1;
-+        }
-+
-+        ggml_backend_graph_compute(backend_cpu, gf);
-+
-+        ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
-+
-+#if 0
-+        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
-+        //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
-+
-+        // sched compute
-+        ggml_build_forward_expand(gf, build_graph());
-+        ggml_backend_sched_init_measure(sched, gf);
-+
-+        // create the graph again, since the previous one was destroyed by the measure
-+        ggml_graph_clear(gf);
-+        ggml_build_forward_expand(gf, build_graph());
-+        ggml_backend_sched_graph_compute(sched, gf);
-+        ggml_backend_sched_free(sched);
-+#endif
-+
-+        ggml_backend_buffer_free(lora_buf);
-+        ggml_backend_buffer_free(graph_buf);
-+        ggml_free(lora_ctx);
-+
-+        n_tensors++;
-+        if (n_tensors % 4 == 0) {
-+            LLAMA_LOG_INFO(".");
-+        }
-+    }
-+
-+    ggml_backend_free(backend_cpu);
-+
-+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
-+    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
-+
-+    return 0;
-+}
-+
-+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-+    try {
-+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
-+    } catch (const std::exception & err) {
-+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-+        return 1;
-+    }
-+}
-\ No newline at end of file
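Deleting 09-lora.diff removes the ggla compatibility fallback: when llama_lora_adapter_init() fails, there is no longer a second attempt through the old llama_model_apply_lora_from_file() loader, so the failure is final. A compact, self-contained sketch of that behavior change (the init_adapter stub below stands in for the real llama.cpp call and is not part of either codebase):

    #include <cstdio>

    struct adapter;  // opaque handle, like the adapter type in llama.cpp

    // Stub standing in for llama_lora_adapter_init(): nullptr means failure,
    // matching the check in the deleted common/common.cpp hunk.
    static adapter * init_adapter(const char * path) { (void) path; return nullptr; }

    // After this commit a failed adapter load aborts initialization outright.
    static bool apply_adapter(const char * path) {
        adapter * a = init_adapter(path);
        if (a == nullptr) {
            std::fprintf(stderr, "error: failed to apply lora adapter '%s'\n", path);
            return false;  // previously: retry via the ggla loader
        }
        return true;
    }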

+ 0 - 43
llm/patches/11-phi3-sliding-window.diff

@@ -1,43 +0,0 @@
-From 6eedae4cf2fcc8015dac79cb3f28f61fcabacab2 Mon Sep 17 00:00:00 2001
-From: Michael Yang <mxyng@pm.me>
-Date: Wed, 31 Jul 2024 14:57:04 -0700
-Subject: [PATCH] phi3 sliding window
-
----
- src/llama.cpp | 6 +++---
- 1 file changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/src/llama.cpp b/src/llama.cpp
-index a207451f..f2872d4e 100644
---- a/src/llama.cpp
-+++ b/src/llama.cpp
-@@ -4893,7 +4893,7 @@ static void llm_load_hparams(
-             } break;
-         case LLM_ARCH_PHI3:
-             {
--                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
-+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
-                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- 
-                 switch (hparams.n_layer) {
-@@ -10762,7 +10762,7 @@ struct llm_build_context {
-         struct ggml_tensor * inp_pos = build_inp_pos();
- 
-         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
--        struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
-+        struct ggml_tensor * KQ_mask = hparams.n_swa > 0 ? build_inp_KQ_mask_swa() : build_inp_KQ_mask();
- 
-         for (int il = 0; il < n_layer; ++il) {
-             auto residual = inpL;
-@@ -10820,7 +10820,7 @@ struct llm_build_context {
- 
-                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
-                         model.layers[il].wo, model.layers[il].bo,
--                        Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
-             }
- 
-             if (il == n_layer - 1) {
--- 
-2.45.2
-