01-cache.diff

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index d86d7e04..7d71c766 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1598,12 +1598,6 @@ struct llama_server_context
                     LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                 }
-                LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
-
-                slot.cache_tokens = prompt_tokens;
-
                 if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
                 {
                     // we have to evaluate at least 1 token to generate logits.
@@ -1615,6 +1609,12 @@ struct llama_server_context
                     }
                 }
+                LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
+
+                llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+
+                slot.cache_tokens = prompt_tokens;
+
                 LOG_VERBOSE("prompt ingested", {
                     {"n_past", slot.n_past},
                     {"cached", tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},