
llm(llama): pass rope factors (#5924)

Michael Yang committed 9 months ago
commit bb46bbcf5e
1 changed file with 71 additions and 0 deletions

llm/patches/0001-llama-3.1-rope-scaling.diff (+71 -0)

@@ -0,0 +1,71 @@
+From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
+From: Michael Yang <mxyng@pm.me>
+Date: Tue, 23 Jul 2024 14:33:29 -0700
+Subject: [PATCH] llama 3.1 rope scaling
+
+---
+ src/llama.cpp | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/src/llama.cpp b/src/llama.cpp
+index 8fe51971..a9969df8 100644
+--- a/src/llama.cpp
++++ b/src/llama.cpp
+@@ -2472,6 +2472,7 @@ struct llama_layer {
+     // long rope factors
+     struct ggml_tensor * rope_long  = nullptr;
+     struct ggml_tensor * rope_short = nullptr;
++    struct ggml_tensor * rope_freqs = nullptr;
+ 
+     // bitnet scale
+     struct ggml_tensor * wq_scale;
+@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
+ 
+                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ 
++                        layer.rope_freqs  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS,  "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
++
+                         if (n_expert == 0) {
+                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff});
+                             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd});
+@@ -8620,6 +8623,10 @@ struct llm_build_context {
+         // choose long/short freq factors based on the context size
+         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+ 
++        if (model.layers[il].rope_freqs != nullptr) {
++            return model.layers[il].rope_freqs;
++        }
++
+         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
+             return model.layers[il].rope_long;
+         }
+@@ -8814,6 +8821,9 @@ struct llm_build_context {
+ 
+             // self-attention
+             {
++                // rope freq factors for llama3; may return nullptr for llama2 and other models
++                struct ggml_tensor * rope_factors = build_rope_factors(il);
++
+                 // compute Q and K and RoPE them
+                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                 cb(Qcur, "Qcur", il);
+@@ -8837,14 +8847,14 @@ struct llm_build_context {
+                 }
+ 
+                 Qcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+                 cb(Qcur, "Qcur", il);
+ 
+                 Kcur = ggml_rope_ext(
+-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
++                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                 );
+-- 
+2.45.2
+
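
For readers unfamiliar with the patch above: the new rope_freqs tensor holds one divisor per rotary dimension (head_dim/2 values, matching the { n_embd/n_head/2 } shape it is created with), and build_rope_factors now returns it so ggml_rope_ext can apply Llama 3.1's per-dimension long-context RoPE scaling. The sketch below shows how such factors are typically derived for Llama 3.1-style rope scaling; it is not taken from this patch, and the helper name compute_rope_factors as well as the default hyperparameters (scale 8, low/high frequency factors 1 and 4, original context length 8192, frequency base 500000) are assumptions for illustration only.

// Sketch (not from this patch): per-dimension RoPE frequency divisors in the
// style of Llama 3.1 rope scaling. The factors act as divisors on the base
// frequencies, so 1.0 leaves a dimension untouched and `scale` stretches its
// wavelength by `scale`.
#include <cmath>
#include <cstdio>
#include <vector>

// Parameter defaults are illustrative assumptions, not values read from the patch.
std::vector<float> compute_rope_factors(
        int   head_dim,                        // n_embd / n_head
        float freq_base        = 500000.0f,
        float scale            = 8.0f,         // rope scaling factor
        float low_freq_factor  = 1.0f,
        float high_freq_factor = 4.0f,
        float orig_ctx         = 8192.0f) {    // original training context
    std::vector<float> factors(head_dim / 2);  // matches { n_embd/n_head/2 } above
    const float low_freq_wavelen  = orig_ctx / low_freq_factor;
    const float high_freq_wavelen = orig_ctx / high_freq_factor;
    for (int i = 0; i < head_dim / 2; ++i) {
        const float freq    = std::pow(freq_base, -2.0f * i / head_dim);
        const float wavelen = 2.0f * (float) M_PI / freq;
        if (wavelen < high_freq_wavelen) {
            factors[i] = 1.0f;                 // high-frequency dims: no scaling
        } else if (wavelen > low_freq_wavelen) {
            factors[i] = scale;                // low-frequency dims: full scaling
        } else {
            // smooth interpolation between the two regimes
            const float smooth = (orig_ctx / wavelen - low_freq_factor)
                               / (high_freq_factor - low_freq_factor);
            factors[i] = 1.0f / ((1.0f - smooth) / scale + smooth);
        }
    }
    return factors;
}

int main() {
    // Print the factors for a 128-dim head (e.g. llama-style 4096/32).
    for (float f : compute_rope_factors(128)) {
        std::printf("%.3f\n", f);
    }
    return 0;
}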