09-lora.diff

diff --git a/common/common.cpp b/common/common.cpp
index dbb724fb..c26fe6ee 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2087,14 +2087,29 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
+
+        // try to load as gguf
         auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
         if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            llama_free(lctx);
-            llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
+
+            // if that fails, try loading as ggla for compatibility
+            int err = llama_model_apply_lora_from_file(model,
+                                            lora_adapter.c_str(),
+                                            lora_scale,
+                                            ((i > 0) || params.lora_base.empty())
+                                                ? NULL
+                                                : params.lora_base.c_str(),
+                                            params.n_threads);
+            if (err != 0) {
+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+                llama_free(lctx);
+                llama_free_model(model);
+                return std::make_tuple(nullptr, nullptr);
+            }
+        } else {
+            llama_lora_adapter_set(lctx, adapter, lora_scale);
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
     }

     if (params.ignore_eos) {
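
The hunk above changes the adapter loop so that the GGUF loader is tried first and the legacy GGLA path is only used as a fallback. The same control flow, condensed into a standalone sketch for readability (the variable names are the ones used in the hunk; nothing beyond the hunk is implied):

    auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
    if (adapter != nullptr) {
        // GGUF adapter: attach it to the context with the requested scale
        llama_lora_adapter_set(lctx, adapter, lora_scale);
    } else {
        // legacy GGLA adapter: merge it into the model weights in place;
        // --lora-base is only honored for the first adapter (i == 0)
        const char * base = ((i > 0) || params.lora_base.empty()) ? NULL : params.lora_base.c_str();
        if (llama_model_apply_lora_from_file(model, lora_adapter.c_str(), lora_scale, base, params.n_threads) != 0) {
            llama_free(lctx);
            llama_free_model(model);
            return std::make_tuple(nullptr, nullptr);
        }
    }
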
diff --git a/include/llama.h b/include/llama.h
index 93fd77ca..b0fb37a6 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1160,6 +1160,20 @@ extern "C" {
     LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int32_t llama_model_apply_lora_from_file(
+            const struct llama_model * model,
+                          const char * path_lora,
+                                 float scale,
+                          const char * path_base_model,
+                               int32_t n_threads);
+
+
 #ifdef __cplusplus
 }
 #endif
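
For a caller linking against the C API, the declaration added above can be used directly. A minimal sketch, assuming a hypothetical adapter path and thread count, with error handling reduced to the documented return value (0 on success):

    #include "llama.h"

    static bool apply_legacy_lora(struct llama_model * model) {
        // path_base_model may be NULL to patch the currently loaded weights in place
        const int32_t rc = llama_model_apply_lora_from_file(
            model,
            "adapter.ggla",          // hypothetical legacy GGLA adapter file
            /*scale           =*/ 1.0f,
            /*path_base_model =*/ NULL,
            /*n_threads       =*/ 4);
        return rc == 0;
    }
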
diff --git a/src/llama.cpp b/src/llama.cpp
index 80a0dd0f..9d7b0e17 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
     fputs(text, stderr);
     fflush(stderr);
 }
+
+static int llama_apply_lora_from_file_internal(
+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
+) {
+    LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
+
+    const int64_t t_start_lora_us = ggml_time_us();
+
+    llama_file fin(path_lora, "rb");
+
+    // verify magic and version
+    {
+        uint32_t magic = fin.read_u32();
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
+            LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
+            return 1;
+        }
+
+        uint32_t format_version = fin.read_u32();
+        if (format_version != 1) {
+            LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
+            return 1;
+        }
+    }
+
+    int32_t lora_r = fin.read_u32();
+    int32_t lora_alpha = fin.read_u32();
+    float scaling = scale * (float)lora_alpha / (float)lora_r;
+
+    LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
+
+    // load base model
+    std::unique_ptr<llama_model_loader> ml;
+    if (path_base_model) {
+        LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
+        ml->init_mappings(/*prefetch*/ false); // no prefetching
+    }
+
+    struct tensor_meta {
+        std::string name;
+        ggml_type type;
+        int32_t ne[2];
+        size_t offset;
+    };
+    std::map<std::string, tensor_meta> tensor_meta_map;
+
+    // load all tensor meta
+    while (true) {
+        if (fin.tell() == fin.size) {
+            // eof
+            break;
+        }
+
+        int32_t n_dims;
+        int32_t name_len;
+        int32_t ftype;
+
+        fin.read_raw(&n_dims, sizeof(n_dims));
+        fin.read_raw(&name_len, sizeof(name_len));
+        fin.read_raw(&ftype, sizeof(ftype));
+
+        if (n_dims != 1 && n_dims != 2) {
+            LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
+            return 1;
+        }
+
+        int32_t ne[2] = { 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            fin.read_raw(&ne[i], sizeof(ne[i]));
+        }
+
+        std::string name;
+        {
+            GGML_ASSERT(name_len < GGML_MAX_NAME);
+            char buf[GGML_MAX_NAME];
+            fin.read_raw(buf, name_len);
+            name = std::string(buf, name_len);
+        }
+
+        // check for lora suffix
+        std::string lora_suffix;
+        if (name.length() > 6) {
+            lora_suffix = name.substr(name.length() - 6);
+        }
+        if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
+            LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
+            return 1;
+        }
+
+        // tensor type
+        ggml_type wtype;
+        switch (ftype) {
+            case 0: wtype = GGML_TYPE_F32; break;
+            case 1: wtype = GGML_TYPE_F16; break;
+            default:
+                {
+                    LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
+                            __func__, ftype);
+                    return 1;
+                }
+        }
+
+        // data offset
+        size_t offset = fin.tell();
+        offset = (offset + 31) & -32;
+
+        // skip tensor data
+        fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
+
+        tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
+    }
+
+    bool warned = false;
+    int n_tensors = 0;
+
+    // apply
+    ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+    if (backend_cpu == nullptr) {
+        LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
+        return 1;
+    }
+    ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
+
+    std::vector<no_init<uint8_t>> read_buf;
+    for (const auto & it : model.tensors_by_name) {
+        const std::string & base_name = it.first;
+        ggml_tensor * model_t = it.second;
+
+        if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
+            tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
+            continue;
+        }
+
+        tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
+        tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
+
+        ggml_init_params lora_init_params = {
+            /* .mem_size   */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
+            /* .mem_buffer */ nullptr,
+            /* .no_alloc   */ true,
+        };
+        ggml_context * lora_ctx = ggml_init(lora_init_params);
+        if (lora_ctx == nullptr) {
+            LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
+            ggml_backend_free(backend_cpu);
+            return 1;
+        }
+
+        // create tensors
+        ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
+        ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
+        ggml_set_name(loraA, metaA.name.c_str());
+        ggml_set_name(loraB, metaB.name.c_str());
+
+        ggml_tensor * base_t;
+        if (ml) {
+            if (!ml->get_tensor_meta(base_name.c_str())) {
+                LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+                return 1;
+            }
+            base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
+        } else {
+            base_t = ggml_dup_tensor(lora_ctx, model_t);
+        }
+        ggml_set_name(base_t, base_name.c_str());
+
+        // allocate in backend buffer
+        ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
+        if (lora_buf == nullptr) {
+            LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
+            return 1;
+        }
+
+        // load tensor data
+        auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
+            read_buf.resize(ggml_nbytes(tensor));
+            fin.seek(tensor_meta.offset, SEEK_SET);
+            fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
+            ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
+        };
+        load_tensor(metaA, loraA);
+        load_tensor(metaB, loraB);
+
+        // load base model tensor data
+        if (ml) {
+            ml->load_data_for(base_t);
+        } else {
+            ggml_backend_tensor_copy(model_t, base_t);
+        }
+
+        if (ggml_is_quantized(base_t->type) && !warned) {
+            LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+                           "use a f16 or f32 base model with --lora-base\n", __func__);
+            warned = true;
+        }
+
+        if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+            LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+                            " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+            ggml_free(lora_ctx);
+            ggml_backend_buffer_free(lora_buf);
+            ggml_backend_free(backend_cpu);
+            return 1;
+        }
+
+        auto build_lora_graph = [&]() {
+            // w = w + BA*s
+            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
+            ggml_set_name(BA, "BA");
+
+            if (scaling != 1.0f) {
+                BA = ggml_scale(lora_ctx, BA, scaling);
+                ggml_set_name(BA, "BA_scaled");
+            }
+
+            ggml_tensor * r;
+            r = ggml_add_inplace(lora_ctx, base_t, BA);
+            ggml_set_name(r, "r_add");
+
+            if (base_t->type != model_t->type) {
+                // convert the result to the model type
+                r = ggml_cast(lora_ctx, r, model_t->type);
+                ggml_set_name(r, "r_cast");
+            }
+
+            return r;
+        };
+
+        ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+        ggml_tensor * r = build_lora_graph();
+        ggml_build_forward_expand(gf, r);
+
+        ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
+        if (graph_buf == nullptr) {
+            LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
+            ggml_free(lora_ctx);
+            ggml_backend_buffer_free(lora_buf);
+            ggml_backend_free(backend_cpu);
+            return 1;
+        }
+
+        ggml_backend_graph_compute(backend_cpu, gf);
+
+        ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
+
+#if 0
+        // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
+        //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
+
+        // sched compute
+        ggml_build_forward_expand(gf, build_graph());
+        ggml_backend_sched_init_measure(sched, gf);
+
+        // create the graph again, since the previous one was destroyed by the measure
+        ggml_graph_clear(gf);
+        ggml_build_forward_expand(gf, build_graph());
+        ggml_backend_sched_graph_compute(sched, gf);
+        ggml_backend_sched_free(sched);
+#endif
+
+        ggml_backend_buffer_free(lora_buf);
+        ggml_backend_buffer_free(graph_buf);
+        ggml_free(lora_ctx);
+
+        n_tensors++;
+        if (n_tensors % 4 == 0) {
+            LLAMA_LOG_INFO(".");
+        }
+    }
+
+    ggml_backend_free(backend_cpu);
+
+    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
+    LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
+
+    return 0;
+}
+
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
+    try {
+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
+    } catch (const std::exception & err) {
+        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
+        return 1;
+    }
+}
\ No newline at end of file
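
The heart of llama_apply_lora_from_file_internal is the per-tensor merge built by build_lora_graph: each matched weight W is replaced by W + scaling * (B·A), with scaling = scale * alpha / r taken from the GGLA header. A self-contained numeric sketch of that update in plain C++ (no ggml; the matrix sizes and values are made up, and the conventional row-major B·A formulation is used rather than ggml's tensor layout):

    #include <cstdio>
    #include <vector>

    int main() {
        const int m = 2, n = 3, r = 1;                      // weight is m x n, adapter rank r
        const float scale = 1.0f, alpha = 16.0f;
        const float scaling = scale * alpha / (float) r;    // same formula as the loader

        std::vector<float> W = {0.1f, 0.2f, 0.3f,
                                0.4f, 0.5f, 0.6f};          // m x n base weight
        std::vector<float> B = {1.0f, -1.0f};               // m x r
        std::vector<float> A = {0.01f, 0.02f, 0.03f};       // r x n

        // W' = W + scaling * (B . A)
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < n; ++j) {
                float delta = 0.0f;
                for (int k = 0; k < r; ++k) {
                    delta += B[i*r + k] * A[k*n + j];
                }
                W[i*n + j] += scaling * delta;
            }
        }

        // print the merged weight
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < n; ++j) {
                printf("%8.4f ", W[i*n + j]);
            }
            printf("\n");
        }
        return 0;
    }

This is the same arithmetic the ggml graph performs for each matched tensor, followed in the patch by an optional cast back to the model's storage type before the result is written over the original weight.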