llama-adapter.h

#pragma once

#include "llama.h"

#include "ggml-cpp.h"

#include &lt;string&gt;
#include &lt;unordered_map&gt;
#include &lt;vector&gt;

// TODO: pimpl

//
// llama_adapter_cvec
//
struct llama_adapter_cvec {
    // returns the control-vector tensor for layer il, or nullptr if none is set for that layer
    struct ggml_tensor * tensor_for(int il) const;

    // adds the layer's control vector to cur (if one exists) and returns the result
    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;

    // loads control-vector data (one n_embd vector per layer) and sets the
    // layer range [il_start, il_end] over which it is applied
    int32_t apply(
            const llama_model & model,
            const float * data,
            size_t len,
            int32_t n_embd,
            int32_t il_start,
            int32_t il_end);

private:
    bool init(const llama_model & model);

    int32_t layer_start = -1;
    int32_t layer_end   = -1;

    std::vector&lt;ggml_context_ptr&gt;        ctxs;
    std::vector&lt;ggml_backend_buffer_ptr&gt; bufs;

    std::vector&lt;struct ggml_tensor *&gt; tensors; // per layer
};
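// Usage sketch (illustrative only, not part of this header): during graph build
// the control vector for layer il is typically folded into the hidden state via
// apply_to(), which reduces to an element-wise add when a vector exists for that
// layer; the variable names below are assumptions:
//
//     // cur = cur + tensor_for(il), when layer il has a control vector
//     cur = cvec.apply_to(ctx, cur, il);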

//
// llama_adapter_lora
//

struct llama_adapter_lora_weight {
    struct ggml_tensor * a = nullptr;
    struct ggml_tensor * b = nullptr;

    // get actual scale based on rank and alpha
    float get_scale(float alpha, float adapter_scale) const {
        const float rank  = (float) b->ne[0];
        const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
        return scale;
    }

    llama_adapter_lora_weight() = default;
    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
};
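// Worked example (not part of this header): for a rank-8 adapter (b->ne[0] == 8)
// trained with alpha = 16 and a user-supplied adapter_scale of 1.0, get_scale()
// returns 1.0 * 16 / 8 = 2.0, i.e. the usual LoRA scaling alpha / rank.
// When alpha == 0 the rank-based scaling is skipped and adapter_scale is used as-is.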

struct llama_adapter_lora {
    // map tensor name to lora_a_b
    std::unordered_map&lt;std::string, struct llama_adapter_lora_weight&gt; ab_map;

    std::vector&lt;ggml_context_ptr&gt;        ctxs;
    std::vector&lt;ggml_backend_buffer_ptr&gt; bufs;

    float alpha;

    llama_adapter_lora() = default;
    ~llama_adapter_lora() = default;

    // returns the A/B pair for the base weight tensor w, or nullptr if this
    // adapter does not modify it
    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
};
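
// Usage sketch (illustrative only; the real graph code lives in the .cpp files
// and the names x, cur, adapter_scale are assumptions): applying the low-rank
// delta of a loaded adapter to a matmul with base weight w, in the usual
// W*x + scale * B*(A*x) form:
//
//     llama_adapter_lora_weight * lw = adapter.get_weight(w);
//     if (lw) {
//         const float scale = lw->get_scale(adapter.alpha, adapter_scale);
//         struct ggml_tensor * ax  = ggml_mul_mat(ctx, lw->a, x);
//         struct ggml_tensor * bax = ggml_mul_mat(ctx, lw->b, ax);
//         cur = ggml_add(ctx, cur, ggml_scale(ctx, bax, scale));
//     }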