0001-llama-3.1-rope-scaling.diff

From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 23 Jul 2024 14:33:29 -0700
Subject: [PATCH] llama 3.1 rope scaling

---
 src/llama.cpp | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 8fe51971..a9969df8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2472,6 +2472,7 @@ struct llama_layer {
     // long rope factors
     struct ggml_tensor * rope_long  = nullptr;
     struct ggml_tensor * rope_short = nullptr;
+    struct ggml_tensor * rope_freqs = nullptr;

     // bitnet scale
     struct ggml_tensor * wq_scale;
@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(

                         layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

+                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
                         if (n_expert == 0) {
                             layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                             layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -8620,6 +8623,10 @@ struct llm_build_context {
         // choose long/short freq factors based on the context size
         const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;

+        if (model.layers[il].rope_freqs != nullptr) {
+            return model.layers[il].rope_freqs;
+        }
+
         if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
             return model.layers[il].rope_long;
         }
@@ -8814,6 +8821,9 @@ struct llm_build_context {

             // self-attention
             {
+                // rope freq factors for llama3; may return nullptr for llama2 and other models
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
@@ -8837,14 +8847,14 @@ struct llm_build_context {
                 }

                 Qcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_ext(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
--
2.45.2
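
For reference, below is a minimal sketch of how the per-dimension frequency factors that end up in the rope_freqs tensor (size n_embd/n_head/2 in the patch) can be derived. It is not part of the patch or of Ollama's conversion code: the function name is hypothetical, the constants are the published Llama 3.1 rope-scaling defaults (scale factor 8, low/high frequency factors 1 and 4, original context 8192), and it assumes the ggml convention that ggml_rope_ext divides the base theta by each factor, so 1.0f leaves a dimension unscaled and 8.0f scales it fully.

#include <cmath>
#include <cstdint>
#include <vector>

// Sketch only: compute head_dim/2 rope frequency factors in the llama 3.1 style.
// head_dim corresponds to n_embd/n_head; rope_theta is the model's rope base.
static std::vector<float> llama31_rope_freq_factors(uint32_t head_dim, float rope_theta) {
    const float scale_factor     = 8.0f;    // overall rope scaling factor (assumed default)
    const float low_freq_factor  = 1.0f;
    const float high_freq_factor = 4.0f;
    const float old_ctx          = 8192.0f; // original max position embeddings (assumed default)
    const float pi               = 3.14159265358979323846f;

    const float low_freq_wavelen  = old_ctx / low_freq_factor;
    const float high_freq_wavelen = old_ctx / high_freq_factor;

    std::vector<float> factors(head_dim / 2);
    for (uint32_t i = 0; i < head_dim / 2; ++i) {
        const float freq    = 1.0f / std::pow(rope_theta, 2.0f * i / head_dim);
        const float wavelen = 2.0f * pi / freq;

        if (wavelen < high_freq_wavelen) {
            factors[i] = 1.0f;         // high-frequency dims are left untouched
        } else if (wavelen > low_freq_wavelen) {
            factors[i] = scale_factor; // low-frequency dims are scaled fully
        } else {
            // smooth ramp between the two regimes
            const float smooth = (old_ctx / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor);
            factors[i] = 1.0f / ((1.0f - smooth) / scale_factor + smooth);
        }
    }
    return factors;
}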