#include "llama-adapter.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"

#include <algorithm>
#include <map>
#include <cassert>
#include <stdexcept>

// vec

struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}
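// Illustrative usage sketch for apply_to() above (the names `cvec` and `ctx0` are
// assumed for illustration only): graph-building code would call it once per layer
// to fold the active control-vector direction into the residual stream:
//
//     cur = cvec.apply_to(ctx0, cur, il);   // no-op when no vector is set for layer il
//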
bool llama_adapter_cvec::init(const llama_model & model) {
    const auto & hparams = model.hparams;

    GGML_ASSERT(tensors.empty());
    GGML_ASSERT(ctxs.empty());
    GGML_ASSERT(bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            struct ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    // make tensors
    tensors.reserve(hparams.n_layer);
    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        bufs.emplace_back(buf);
    }

    return true;
}
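// Note on llama_adapter_cvec::init() above: it creates one zero-initialized F32 tensor
// of size n_embd per layer (layer 0 is always skipped), grouping the tensors into one
// ggml context and one backend buffer per backend buffer type, so each layer's vector
// is allocated on the same backend as that layer's weights (via model.select_buft).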
int32_t llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    const auto & hparams = model.hparams;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
        return 0;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return 1;
    }

    if (tensors.empty()) {
        if (!init(model)) {
            return 1;
        }
    }

    layer_start = il_start;
    layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }

    return 0;
}
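// Note on llama_adapter_cvec::apply() above: `data` is a flat array of (n_layer - 1)
// rows of n_embd floats starting at layer 1, so the row for layer il begins at index
// n_embd * (il - 1). Rows that would read past `len` are skipped, leaving those
// layers' tensors at their previous (or zeroed) values.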
// lora

llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }

    return nullptr;
}
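// Note on llama_adapter_lora::get_weight() above: lookups are keyed by the base-model
// tensor name (w->name), so callers can ask whether a LoRA (A, B) pair exists for any
// model weight they are about to use, and fall back to the plain weight when it doesn't.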
static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model.arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
    }
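    // Illustrative sketch of the GGUF metadata the block above expects; key names follow
    // the LLM_KV_* mapping and the values are made-up examples, not taken from a real file:
    //
    //     general.type         = "adapter"
    //     general.architecture = "llama"   // must resolve to the same arch as the base model
    //     adapter.type         = "lora"
    //     adapter.lora.alpha   = 16.0      // stored into adapter.alpha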
    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            struct ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx);
            return buft_ctx;
        }

        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else if (str_endswith(name, "_norm.weight")) {
            // TODO: add support for norm vector
            // for now, we don't really care because most adapters still work fine without it
            continue;
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }
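    // Note on the loop above: adapter tensors are expected to come in pairs named after
    // the base-model weight they modify, with ".lora_a" / ".lora_b" suffixes. For example
    // (hypothetical names), "blk.0.attn_q.weight.lora_a" and "blk.0.attn_q.weight.lora_b"
    // are both stored under the key "blk.0.attn_q.weight" in ab_map.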
    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_adapter_lora_weight & w = it.second;
        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }

        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));

        // validate tensor shape
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
            if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
        } else {
            if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
                throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
            }
            if (w.a->ne[1] != w.b->ne[0]) {
                throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
            }
        }

        // save tensor to adapter
        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }
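    // Note on the shape checks above (ggml ne[] order, ne[0] = fastest dimension): for a
    // weight W with ne = [n_in, n_out], A must have ne = [n_in, rank] and B must have
    // ne = [rank, n_out], so that the delta applied at inference is roughly
    // W'x = Wx + scale * B(Ax). The scale is conventionally derived from adapter.alpha
    // and the rank (alpha / rank, times a user-supplied adapter scale); see
    // llama_adapter_lora_weight in llama-adapter.h for how it is computed in this codebase.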
    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
    struct llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

        delete adapter;
    }

    return nullptr;
}

void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
    delete adapter;
}
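// Illustrative usage sketch, assuming the public C API declared in llama.h (function
// names may differ between versions; "my-adapter.gguf" is a placeholder path):
//
//     llama_adapter_lora * adapter = llama_adapter_lora_init(model, "my-adapter.gguf");
//     if (adapter != nullptr) {
//         llama_set_adapter_lora(ctx, adapter, /*scale=*/1.0f); // attach to a context
//         // ... run inference ...
//         llama_adapter_lora_free(adapter);                     // free once no longer attached
//     }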