llama-adapter.cpp

/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "llama-adapter.h"
#include "llama-model.h"

#include <algorithm>
#include <map>
#include <cassert>
#include <stdexcept>
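
// This translation unit covers the two adapter mechanisms used by llama.cpp:
// control vectors (a per-layer F32 direction added to the hidden state) and
// LoRA adapters (pairs of low-rank lora_a/lora_b tensors loaded from a GGUF file).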

// vec

struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }

    return tensors[il];
}

struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
    }

    return cur;
}
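
// Lazily creates the per-layer control vector tensors: one F32 tensor of size
// n_embd for every layer except layer 0, grouped into one ggml context per
// backend buffer type, then allocated and zero-initialized.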
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
    const auto & hparams = model.hparams;

    GGML_ASSERT(cvec.tensors.empty());
    GGML_ASSERT(cvec.ctxs.empty());
    GGML_ASSERT(cvec.bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            struct ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };

            ggml_context * ctx = ggml_init(params);
            if (!ctx) {
                return nullptr;
            }

            ctx_map[buft] = ctx;
            cvec.ctxs.emplace_back(ctx);

            return ctx;
        }

        return it->second;
    };

    // make tensors
    cvec.tensors.reserve(hparams.n_layer);
    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
        cvec.tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    cvec.bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        cvec.bufs.emplace_back(buf);
    }

    return true;
}
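
// Loads control vector data into the per-layer tensors and records the active
// layer range. Passing data == nullptr disables the current vector without
// freeing it; a non-zero return value indicates failure.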
int32_t llama_control_vector_apply(
        struct llama_control_vector & cvec,
        const llama_model & model,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    const auto & hparams = model.hparams;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        cvec.layer_start = -1;
        cvec.layer_end   = -1;
        return 0;
    }

    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return 1;
    }

    if (cvec.tensors.empty()) {
        if (!llama_control_vector_init(cvec, model)) {
            return 1;
        }
    }

    cvec.layer_start = il_start;
    cvec.layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
        assert(cvec.tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
        }
    }

    return 0;
}

// lora

llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
    if (pos != ab_map.end()) {
        return &pos->second;
    }

    return nullptr;
}

void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
    delete adapter;
}
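
// Loads a LoRA adapter from a GGUF file into `adapter`:
//   1. validate the GGUF metadata (general.type, architecture, adapter.type, alpha)
//   2. pair each ".lora_a"/".lora_b" tensor with its base-model tensor and check shapes
//   3. allocate device buffers per backend buffer type and copy tensor data from the file
// Throws std::runtime_error on any validation or allocation failure.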
static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };

    gguf_context_ptr ctx_gguf { gguf_init_from_file(path_lora, meta_gguf_params) };
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    ggml_context_ptr ctx { ctx_init };

    // check metadata
    {
        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf.get(), id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(ctx_gguf.get(), key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf.get(), id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model.arch) {
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
    }

    int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            struct ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            if (!buft_ctx) {
                return nullptr;
            }
            ctx_map[buft] = buft_ctx;
            adapter.ctxs.emplace_back(buft_ctx);
            return buft_ctx;
        }
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };

    for (ggml_tensor * cur = ggml_get_first_tensor(ctx.get()); cur; cur = ggml_get_next_tensor(ctx.get(), cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else {
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_lora_weight & w = it.second;

        if (!w.a || !w.b) {
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
        }

        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
        // validate tensor shape
        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
        }
        if (w.a->ne[1] != w.b->ne[0]) {
            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
        }

        // save tensor to adapter
        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto & it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_ptr buf { ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft) };
            if (!buf) {
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get())/1024.0/1024.0);
            adapter.bufs.emplace_back(std::move(buf));
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}
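
// Public entry point: allocates an adapter, loads it from `path_lora`, and returns
// it, or returns nullptr (after logging the error) if loading fails. Typical usage
// (the file name below is illustrative):
//
//     llama_lora_adapter * adapter = llama_lora_adapter_init(model, "my-adapter.gguf");
//     if (adapter != nullptr) {
//         // ... use the adapter, then release it:
//         llama_lora_adapter_free(adapter);
//     }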
struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
    struct llama_lora_adapter * adapter = new llama_lora_adapter();

    try {
        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());

        delete adapter;
    }

    return nullptr;
}