0011-llama-Ensure-KV-cache-is-fully-defragmented.patch

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Jesse Gross <jesse@ollama.com>
Date: Fri, 13 Dec 2024 16:11:59 -0800
Subject: [PATCH] llama: Ensure KV cache is fully defragmented.

Sometimes the KV cache requires defragmentation even without
triggering the threshold heuristic. In this case, decoding
will not be able to find a KV cache slot. This is particularly
difficult for the caller to handle if it happens in between
ubatches. To avoid this, we should immediately trigger a defrag.
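
In code, the retry path added to llama_prepare_ubatch below amounts to
the following (a minimal, self-contained sketch; fake_cache,
try_find_slot() and defrag() are illustrative stand-ins for the
llama_kv_cache_* helpers the patch actually calls):

    #include <cstdio>

    struct fake_cache { bool fragmented = true; };

    // stand-in: pretend a slot can only be found once the cache is defragmented
    static bool try_find_slot(const fake_cache & kv) { return !kv.fragmented; }
    static void defrag(fake_cache & kv) { kv.fragmented = false; }

    static int prepare(fake_cache & kv) {
        bool slot = try_find_slot(kv);
        if (!slot) {
            defrag(kv);                // trigger defragmentation immediately...
            slot = try_find_slot(kv);  // ...and retry once before giving up
        }
        return slot ? 0 : 1;           // 1 = caller-visible failure, as in the patch
    }

    int main() {
        fake_cache kv;
        std::printf("prepare() -> %d\n", prepare(kv));  // prints "prepare() -> 0"
    }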

In addition, a heavily fragmented cache can require more than
max_moves to defragment. Currently, we stop when we hit the limit,
but this can leave a cache that still does not have adequate space
even after defragmentation is triggered. Instead, we should do
multiple batches of processing until everything is complete.
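
The batching in the final hunk boils down to the loop sketched here
(self-contained; kv_move and apply_chunk() are illustrative stand-ins,
and max_moves plays the role of the existing per-graph move limit):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct kv_move { uint32_t src, dst, len; };  // mirrors llama_kv_defrag_move

    // stand-in for building and computing one defrag graph for a chunk
    static void apply_chunk(const std::vector<kv_move> & chunk) {
        std::printf("applying %zu moves\n", chunk.size());
    }

    int main() {
        const std::size_t max_moves = 3;  // per-graph move limit
        const std::vector<kv_move> moves = {
            {9, 0, 2}, {14, 2, 1}, {20, 3, 4}, {30, 7, 1}, {40, 8, 2},
        };

        // process the full move list in chunks of at most max_moves,
        // one graph per chunk, until everything is complete
        for (std::size_t i = 0; i < moves.size(); i += max_moves) {
            const std::size_t end = std::min(i + max_moves, moves.size());
            std::vector<kv_move> chunk(moves.begin() + i, moves.begin() + end);
            apply_chunk(chunk);
        }
    }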

---
 src/llama.cpp | 99 ++++++++++++++++++++++++---------------------------
 1 file changed, 46 insertions(+), 53 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 8f7902df..01854fce 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1054,6 +1054,13 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
     return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
 }
 
+// block of KV slots to move when defragging
+struct llama_kv_defrag_move {
+    uint32_t src;
+    uint32_t dst;
+    uint32_t len;
+};
+
 struct llm_build_context {
     const llama_model & model;
     llama_context & lctx;
@@ -1230,35 +1237,23 @@
         return gf;
     }
 
-    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+    struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        for (uint32_t i = 0; i < ids.size(); ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == ids.size()) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
+        for (const auto & move : moves) {
            for (int il = 0; il < n_layer; ++il) {
                 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
                 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
 
                 ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
 
                 ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                        n_embd_k_gqa, nm,
+                        n_embd_k_gqa, move.len,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
 
                 ggml_tensor * view_v_src;
                 ggml_tensor * view_v_dst;
@@ -1266,31 +1261,29 @@
                 if (flash_attn) {
                     // NOTE: the V cache is not transposed when using flash attention
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            n_embd_v_gqa, nm,
+                            n_embd_v_gqa, move.len,
                             ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                            ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
                 } else {
                     view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, i));
+                            ggml_row_size(kv_self.v_l[il]->type, move.src));
 
                     view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                            nm, n_embd_v_gqa,
+                            move.len, n_embd_v_gqa,
                             ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
-                            ggml_row_size(kv_self.v_l[il]->type, id));
+                            ggml_row_size(kv_self.v_l[il]->type, move.dst));
                 }
 
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
                 ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
             }
-
-            i += nm - 1;
         }
 
         //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
@@ -8508,7 +8501,7 @@
     }
 };
 
-static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
     llama_ubatch dummy = {};
     dummy.equal_seqs = true;
 
@@ -8518,7 +8511,7 @@ static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const
 
     llm.init();
 
-    struct ggml_cgraph * result = llm.build_defrag(ids);
+    struct ggml_cgraph * result = llm.build_defrag(moves);
 
     llm.free();
 
@@ -8956,7 +8949,12 @@ static int llama_prepare_ubatch(
         kv_self.head = 0;
     }
 
-    const auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+    auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
+    if (!slot) {
+        llama_kv_cache_defrag(kv_self);
+        llama_kv_cache_update(&lctx);
+        slot = llama_kv_cache_find_slot(kv_self, ubatch);
+    }
     if (!slot) {
         return 1;
     }
@@ -9431,8 +9429,8 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
     //const int64_t t_start = ggml_time_us();
 
-    // number of cells moved
-    uint32_t n_moves = 0;
+    // groups of cells moved
+    std::vector<struct llama_kv_defrag_move> moves;
 
     // each move requires 6*n_layer tensors (see build_defrag)
     //   - source view, destination view, copy operation
@@ -9496,19 +9494,11 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
         // are we moving a continuous block of memory?
         bool cont = false;
 
-        // should we stop searching for the next move?
-        bool stop = false;
-
         // go back and move the nf cells to the hole
         for (; i1 < n_kv; ++i1) {
             auto & cell1 = kv_self.cells[i1];
 
             if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
                 cont = false;
                 continue;
             }
@@ -9524,8 +9514,10 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
             kv_self.head = n_used;
 
             if (!cont) {
-                n_moves++;
+                moves.push_back({i1, i0 + nf, 1});
                 cont = true;
+            } else {
+                moves.back().len++;
             }
 
             nf++;
@@ -9535,22 +9527,16 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
             }
         }
 
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
         //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
 
         i0 += nh - 1;
     }
 
-    if (n_moves == 0) {
+    if (moves.size() == 0) {
         return;
     }
 
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
 
 #if 0
     // CPU defrag
@@ -9625,11 +9611,18 @@ static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
 #else
     // ggml_graph defrag
 
-    ggml_backend_sched_reset(lctx.sched.get());
+    for (std::size_t i = 0; i < moves.size(); i += max_moves) {
+        std::vector<struct llama_kv_defrag_move> chunk;
+        auto end = std::min(i + max_moves, moves.size());
+        chunk.assign(moves.begin() + i, moves.begin() + end);
 
-    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+        ggml_backend_sched_reset(lctx.sched.get());
+
+        //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
+        ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
 
-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+        llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
+    }
 #endif
 
     //const int64_t t_end = ggml_time_us();