llama.cpp 557 KB


  1. /**
  2. * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
  3. *
  4. * MIT License
  5. *
  6. * Copyright (c) 2023-2024 The ggml authors
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy
  9. * of this software and associated documentation files (the "Software"), to deal
  10. * in the Software without restriction, including without limitation the rights
  11. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. * copies of the Software, and to permit persons to whom the Software is
  13. * furnished to do so, subject to the following conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in all
  16. * copies or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  24. * SOFTWARE.
  25. */
  26. #include "llama-impl.h"
  27. #include "llama-chat.h"
  28. #include "llama-mmap.h"
  29. #include "llama-context.h"
  30. #include "llama-vocab.h"
  31. #include "llama-sampling.h"
  32. #include "llama-kv-cache.h"
  33. #include "llama-model-loader.h"
  34. #include "llama-model.h"
  35. #include "llama-quant.h"
  36. #include "ggml.h"
  37. #include "ggml-alloc.h"
  38. #include "ggml-backend.h"
  39. #include "ggml-cpp.h"
  40. #include <algorithm>
  41. #include <array>
  42. #include <cassert>
  43. #include <cctype>
  44. #include <cfloat>
  45. #include <cinttypes>
  46. #include <climits>
  47. #include <cmath>
  48. #include <cstdarg>
  49. #include <cstddef>
  50. #include <cstdint>
  51. #include <cstdio>
  52. #include <cstring>
  53. #include <ctime>
  54. #include <functional>
  55. #include <initializer_list>
  56. #include <locale>
  57. #include <map>
  58. #include <numeric>
  59. #include <type_traits>
  60. #if defined(_MSC_VER)
  61. #pragma warning(disable: 4244 4267) // possible loss of data
  62. #endif
  63. //
  64. // tensor loading (TODO: add llama_tensor_loader?)
  65. //
  66. static int llama_get_device_count(const llama_model & model) {
  67. return (int) model.devices.size();
  68. }
  69. // checks if the weight tensor can be used with the specified buffer type and device
  70. static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
  71. GGML_ASSERT(w != nullptr);
  72. if (op == GGML_OP_NONE) {
  73. return true;
  74. }
  75. ggml_init_params params = {
  76. /*.mem_size =*/ ggml_tensor_overhead()*8,
  77. /*.mem_buffer =*/ NULL,
  78. /*.no_alloc =*/ true,
  79. };
  80. ggml_context_ptr ctx_ptr { ggml_init(params) };
  81. if (!ctx_ptr) {
  82. throw std::runtime_error(format("failed to create ggml context"));
  83. }
  84. ggml_context * ctx = ctx_ptr.get();
  85. ggml_tensor * op_tensor = nullptr;
  86. switch (op) {
  87. case GGML_OP_GET_ROWS:
  88. {
  89. ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
  90. op_tensor = ggml_get_rows(ctx, w, b);
  91. } break;
  92. case GGML_OP_MUL_MAT:
  93. {
  94. ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
  95. op_tensor = ggml_mul_mat(ctx, w, b);
  96. } break;
  97. case GGML_OP_MUL_MAT_ID:
  98. {
  99. int n_expert_used = hparams.n_expert_used;
  100. ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
  101. ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
  102. op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
  103. } break;
  104. case GGML_OP_ADD:
  105. {
  106. ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  107. op_tensor = ggml_add(ctx, a, w);
  108. } break;
  109. case GGML_OP_MUL:
  110. {
  111. ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
  112. op_tensor = ggml_mul(ctx, a, w);
  113. } break;
  114. case GGML_OP_DIV:
  115. {
  116. ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
  117. op_tensor = ggml_div(ctx, a, w);
  118. } break;
  119. case GGML_OP_ROPE:
  120. {
  121. int n_embd_head = hparams.n_embd_head_v;
  122. int n_head = hparams.n_head();
  123. ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
  124. ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
  125. op_tensor = ggml_rope_ext(
  126. ctx, a, b, w,
  127. 0, 0, 0, 0, 0,
  128. 0, 0, 0, 0
  129. );
  130. } break;
  131. case GGML_OP_SSM_CONV:
  132. {
  133. // FIXME
  134. ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
  135. op_tensor = ggml_ssm_conv(ctx, conv_x, w);
  136. } break;
  137. case GGML_OP_SSM_SCAN:
  138. {
  139. // FIXME
  140. const int64_t d_state = w->ne[0];
  141. const int64_t d_inner = w->ne[1];
  142. const int64_t n_seq_tokens = 512;
  143. const int64_t n_seqs = 1;
  144. ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
  145. ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
  146. ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
  147. ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
  148. ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
  149. op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
  150. } break;
  151. case GGML_OP_RWKV_WKV6:
  152. {
  153. // FIXME
  154. const int64_t S = 123;
  155. const int64_t H = 123;
  156. const int64_t n_tokens = 123;
  157. const int64_t n_seqs = 123;
  158. ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, 1, H, n_tokens);
  159. ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
  160. ggml_tensor * r = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
  161. ggml_tensor * tf = w;
  162. ggml_tensor * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
  163. ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
  164. op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
  165. } break;
  166. case GGML_OP_IM2COL:
  167. {
  168. const int n_embd = hparams.n_embd;
  169. ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
  170. op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
  171. } break;
  172. default:
  173. GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
  174. }
  175. // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
  176. GGML_ASSERT(w->buffer == nullptr);
  177. w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
  178. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  179. ggml_backend_buffer_free(w->buffer);
  180. w->buffer = nullptr;
  181. return op_supported;
  182. }
  183. // find the first buffer type in the list that can use the tensor
  184. static ggml_backend_buffer_type_t select_weight_buft(const llama_model & model, ggml_tensor * tensor, ggml_op op, const llama_model::buft_list_t & buft_list) {
  185. GGML_ASSERT(!buft_list.empty());
  186. for (const auto & cur : buft_list) {
  187. ggml_backend_dev_t cur_dev = cur.first;
  188. ggml_backend_buffer_type_t cur_buft = cur.second;
  189. if (weight_buft_supported(model.hparams, tensor, op, cur_buft, cur_dev)) {
  190. return cur_buft;
  191. }
  192. }
  193. return nullptr;
  194. }
  195. // CPU: ACCEL -> CPU extra -> GPU host -> CPU
  196. static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) {
  197. llama_model::buft_list_t buft_list;
  198. // add ACCEL buffer types
  199. for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  200. ggml_backend_dev_t dev = ggml_backend_dev_get(i);
  201. if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
  202. auto * buft = ggml_backend_dev_buffer_type(dev);
  203. // skip
  204. if (buft != ggml_backend_cpu_buffer_type()) {
  205. buft_list.emplace_back(dev, buft);
  206. }
  207. }
  208. }
  209. // add extra buffer types
  210. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  211. auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
  212. auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
  213. ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
  214. if (ggml_backend_dev_get_extra_bufts_fn) {
  215. ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
  216. while (extra_bufts && *extra_bufts) {
  217. buft_list.emplace_back(cpu_dev, *extra_bufts);
  218. ++extra_bufts;
  219. }
  220. }
  221. // add a host buffer type
  222. // storing the tensors in a host buffer is useful when the processing of large batches
  223. // is offloaded to a GPU device, since it reduces the time spent on data transfers
  224. // generally, this will be done using the first device in the list
  225. // a better approach would be to handle this on a weight-by-weight basis using the offload_op
  226. // function of the device to determine if it would benefit from being stored in a host buffer
  227. for (auto * dev : model.devices) {
  228. ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
  229. if (buft) {
  230. buft_list.emplace_back(dev, buft);
  231. break;
  232. }
  233. }
  234. // add the CPU buffer type
  235. for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  236. ggml_backend_dev_t dev = ggml_backend_dev_get(i);
  237. if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
  238. buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
  239. }
  240. }
  241. return buft_list;
  242. }
  243. // GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
  244. static llama_model::buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
  245. llama_model::buft_list_t buft_list;
  246. // add the device split buffer type if requested and available
  247. if (split_mode == LLAMA_SPLIT_MODE_ROW) {
  248. ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
  249. auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
  250. ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
  251. if (ggml_backend_split_buffer_type_fn) {
  252. size_t dev_index = [&]() {
  253. auto * reg = ggml_backend_dev_backend_reg(dev);
  254. for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
  255. if (ggml_backend_reg_dev_get(reg, i) == dev) {
  256. return i;
  257. }
  258. }
  259. throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
  260. }();
  261. auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
  262. if (buft != nullptr) {
  263. buft_list.emplace_back(dev, buft);
  264. }
  265. }
  266. }
  267. // add the device default buffer type
  268. buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
  269. return buft_list;
  270. }
  271. // Returns false if cancelled by progress_callback
  272. static bool llm_load_tensors(
  273. llama_model_loader & ml,
  274. llama_model & model,
  275. int n_gpu_layers,
  276. enum llama_split_mode split_mode,
  277. int main_gpu,
  278. const float * tensor_split,
  279. bool use_mlock,
  280. llama_progress_callback progress_callback,
  281. void * progress_callback_user_data) {
  282. auto & hparams = model.hparams;
  283. model.split_mode = split_mode;
  284. model.main_gpu = main_gpu;
  285. model.n_gpu_layers = n_gpu_layers;
  286. const int n_layer = hparams.n_layer;
  287. bool use_mmap_buffer = true;
  288. // build a list of buffer types for the CPU and GPU devices
  289. model.cpu_buft_list = make_cpu_buft_list(model);
  290. for (auto * dev : model.devices) {
  291. llama_model::buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  292. // add CPU buffer types as a fallback
  293. buft_list.insert(buft_list.end(), model.cpu_buft_list.begin(), model.cpu_buft_list.end());
  294. model.gpu_buft_list.emplace(dev, std::move(buft_list));
  295. }
  296. // calculate the split points
  297. int device_count = llama_get_device_count(model);
  298. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
  299. std::vector<float> splits(device_count);
  300. if (all_zero) {
  301. // default split, by free memory
  302. for (int i = 0; i < device_count; ++i) {
  303. ggml_backend_dev_t dev = model.devices[i];
  304. size_t total;
  305. size_t free;
  306. ggml_backend_dev_memory(dev, &free, &total);
  307. splits[i] = free;
  308. }
  309. } else {
  310. std::copy(tensor_split, tensor_split + device_count, splits.begin());
  311. }
  312. // sum and normalize the splits to get the split points
  313. float split_sum = 0.0f;
  314. for (int i = 0; i < device_count; ++i) {
  315. split_sum += splits[i];
  316. splits[i] = split_sum;
  317. }
  318. for (int i = 0; i < device_count; ++i) {
  319. splits[i] /= split_sum;
  320. }
  321. ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  322. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  323. const int act_gpu_layers = model.devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
  324. auto get_layer_buft_list = [&](int il) -> llama_model::layer_dev {
  325. if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  326. return {cpu_dev, &model.cpu_buft_list};
  327. }
  328. int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  329. auto * dev = model.devices.at(layer_gpu);
  330. return {dev, &model.gpu_buft_list.at(dev)};
  331. };
  332. // assign the input layer
  333. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  334. model.dev_input = { cpu_dev, &model.cpu_buft_list };
  335. // assign the repeating layers to the devices according to the splits
  336. model.dev_layer.resize(n_layer);
  337. for (int il = 0; il < n_layer; ++il) {
  338. model.dev_layer[il] = get_layer_buft_list(il);
  339. }
  340. // assign the output layer
  341. model.dev_output = get_layer_buft_list(n_layer);
  342. // one ggml context per buffer type
  343. int max_n_tensors = ml.n_tensors;
  344. max_n_tensors += 1; // duplicated output tensor
  345. max_n_tensors += n_layer*2; // duplicated rope freq tensors
  346. const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
  347. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  348. auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  349. auto it = ctx_map.find(buft);
  350. if (it == ctx_map.end()) {
  351. ggml_init_params params = {
  352. /*.mem_size =*/ ctx_size,
  353. /*.mem_buffer =*/ NULL,
  354. /*.no_alloc =*/ true,
  355. };
  356. ggml_context * ctx = ggml_init(params);
  357. if (!ctx) {
  358. throw std::runtime_error(format("failed to create ggml context"));
  359. }
  360. ctx_map[buft] = ctx;
  361. model.ctxs.emplace_back(ctx);
  362. return ctx;
  363. }
  364. return it->second;
  365. };
  366. // create tensors for the weights
  367. {
  368. // note: cast to int64_t since we will use these for the tensor dimensions
  369. const int64_t n_head = hparams.n_head();
  370. const int64_t n_head_kv = hparams.n_head_kv();
  371. const int64_t n_embd = hparams.n_embd;
  372. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  373. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  374. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  375. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  376. const int64_t n_ff = hparams.n_ff();
  377. const int64_t n_embd_gqa = n_embd_v_gqa;
  378. const int64_t n_vocab = hparams.n_vocab;
  379. const int64_t n_vocab_type = hparams.n_vocab_type;
  380. const int64_t n_rot = hparams.n_rot;
  381. const int64_t n_expert = hparams.n_expert;
  382. const int64_t n_expert_used = hparams.n_expert_used;
  383. const int64_t n_ctx_train = hparams.n_ctx_train;
  384. if (n_expert > 0 && hparams.n_expert_used == 0) {
  385. throw std::runtime_error("model has expert layers but no expert layers are used");
  386. }
  387. int n_moved_tensors = 0;
  388. ggml_tensor * first_moved_tensor = nullptr;
  389. ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
  390. ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
  391. auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
  392. ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
  393. if (!t_meta) {
  394. if (flags & llama_model_loader::TENSOR_NOT_REQUIRED) {
  395. return nullptr;
  396. }
  397. throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
  398. }
  399. // some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
  400. // the tensor is duplicated
  401. // to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
  402. llm_tensor tn_tensor = tn.tensor;
  403. if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & llama_model_loader::TENSOR_DUPLICATED) {
  404. tn_tensor = LLM_TENSOR_OUTPUT;
  405. }
  406. llm_tensor_info info;
  407. try {
  408. info = llm_tensor_info_for(tn_tensor);
  409. } catch (const std::out_of_range & e) {
  410. throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
  411. }
  412. // tensors with "bias" suffix are always used with GGML_OP_ADD
  413. ggml_op op;
  414. bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  415. if (bias) {
  416. op = GGML_OP_ADD;
  417. } else {
  418. op = info.op;
  419. }
  420. // sanity checks
  421. if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
  422. if (tn.bid != -1) {
  423. GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
  424. }
  425. } else {
  426. if (tn.bid == -1) {
  427. GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
  428. }
  429. }
  430. // select the buffer type for this tensor
  431. llama_model::buft_list_t * buft_list;
  432. switch (info.layer) {
  433. case LLM_TENSOR_LAYER_INPUT:
  434. buft_list = model.dev_input.buft_list;
  435. break;
  436. case LLM_TENSOR_LAYER_OUTPUT:
  437. buft_list = model.dev_output.buft_list;
  438. break;
  439. case LLM_TENSOR_LAYER_REPEATING:
  440. buft_list = model.dev_layer.at(tn.bid).buft_list;
  441. break;
  442. default:
  443. GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  444. }
  445. ggml_backend_buffer_type_t buft = select_weight_buft(model, t_meta, op, *buft_list);
  446. if (!buft) {
  447. throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
  448. }
  449. // avoid using a host buffer when using mmap
  450. auto * buft_dev = ggml_backend_buft_get_device(buft);
  451. if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  452. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  453. buft = ggml_backend_dev_buffer_type(cpu_dev);
  454. }
  455. if (buft != buft_list->front().second) {
  456. n_moved_tensors++;
  457. if (!first_moved_tensor) {
  458. first_moved_tensor = t_meta;
  459. first_moved_from_buft = buft_list->front().second;
  460. first_moved_to_buft = buft;
  461. }
  462. }
  463. ggml_context * ctx = ctx_for_buft(buft);
  464. // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
  465. if (flags & llama_model_loader::TENSOR_DUPLICATED) {
  466. ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
  467. if (t) {
  468. return t;
  469. }
  470. }
  471. return ml.create_tensor(ctx, tn, ne, flags);
  472. };
  473. model.layers.resize(n_layer);
  474. // TODO: move to a separate function
  475. const auto tn = LLM_TN(model.arch);
  476. switch (model.arch) {
  477. case LLM_ARCH_LLAMA:
  478. case LLM_ARCH_REFACT:
  479. case LLM_ARCH_MINICPM:
  480. case LLM_ARCH_GRANITE:
  481. case LLM_ARCH_GRANITE_MOE:
  482. {
  483. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  484. // output
  485. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  486. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  487. // if output is NULL, init from the input tok embed
  488. if (model.output == NULL) {
  489. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  490. }
  491. for (int i = 0; i < n_layer; ++i) {
  492. auto & layer = model.layers[i];
  493. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  494. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  495. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  496. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  497. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  498. // optional bias tensors
  499. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  500. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  501. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  502. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  503. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  504. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  505. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  506. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  507. }
  508. else {
  509. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  510. }
  511. if (n_expert == 0) {
  512. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  513. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  514. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  515. // optional MLP bias
  516. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  517. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  518. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  519. } else {
  520. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  521. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  522. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  523. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  524. }
  525. }
  526. } break;
  527. case LLM_ARCH_MLLAMA:
  528. {
  529. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
  530. // output
  531. {
  532. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  533. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  534. // if output is NULL, init from the input tok embed
  535. if (model.output == NULL) {
  536. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  537. }
  538. }
  539. for (int i = 0; i < n_layer; ++i) {
  540. auto & layer = model.layers[i];
  541. if (hparams.cross_attention_layers(i)) {
  542. layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
  543. layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
  544. layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
  545. layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
  546. layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
  547. layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
  548. layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
  549. layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
  550. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  551. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  552. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  553. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  554. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  555. } else {
  556. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  557. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  558. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  559. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  560. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  561. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  562. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  563. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  564. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  565. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  566. }
  567. }
  568. } break;
  569. case LLM_ARCH_DECI:
  570. {
  571. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  572. // output
  573. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  574. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  575. // if output is NULL, init from the input tok embed
  576. if (model.output == NULL) {
  577. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  578. }
  579. for (int i = 0; i < n_layer; ++i) {
  580. auto & layer = model.layers[i];
  581. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  582. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  583. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  584. const int64_t n_ff = hparams.n_ff(i);
  585. const int64_t n_head = hparams.n_head(i);
  586. const int64_t n_head_kv = hparams.n_head_kv(i);
  587. if (n_head_kv == 0 && n_head > 0) {
  588. // linear attention for DeciLMCausalModel
  589. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  590. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  591. }
  592. else if (n_head_kv > 0) {
  593. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  594. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  595. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  596. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  597. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  598. }
  599. // optional bias tensors
  600. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  601. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  602. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  603. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  604. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  605. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  606. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  607. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  608. }
  609. else {
  610. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  611. }
  612. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  613. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  614. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  615. // optional MLP bias
  616. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  617. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  618. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  619. }
  620. } break;
  621. case LLM_ARCH_MINICPM3:
  622. {
  623. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  624. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  625. const int64_t q_lora_rank = hparams.n_lora_q;
  626. const int64_t kv_lora_rank = hparams.n_lora_kv;
  627. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  628. // output
  629. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  630. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  631. // if output is NULL, init from the input tok embed
  632. if (model.output == NULL) {
  633. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  634. }
  635. for (int i = 0; i < n_layer; ++i) {
  636. auto & layer = model.layers[i];
  637. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  638. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  639. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  640. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  641. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  642. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  643. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  644. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  645. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  646. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  647. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  648. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  649. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  650. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  651. }
  652. } break;
  653. case LLM_ARCH_GROK:
  654. {
  655. if (n_expert == 0) {
  656. throw std::runtime_error("Grok model cannot have zero experts");
  657. }
  658. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  659. // output
  660. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  661. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  662. // if output is NULL, init from the input tok embed
  663. if (model.output == NULL) {
  664. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  665. }
  666. for (int i = 0; i < n_layer; ++i) {
  667. auto & layer = model.layers[i];
  668. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  669. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  670. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  671. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  672. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  673. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  674. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  675. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  676. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  677. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  678. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  679. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  680. }
  681. } break;
  682. case LLM_ARCH_DBRX:
  683. {
  684. if (n_expert == 0) {
  685. throw std::runtime_error("DBRX model cannot have zero experts");
  686. }
  687. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  688. // output
  689. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  690. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  691. for (int i = 0; i < n_layer; ++i) {
  692. auto & layer = model.layers[i];
  693. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  694. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  695. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  696. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  697. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  698. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  699. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  700. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  701. }
  702. } break;
  703. case LLM_ARCH_BAICHUAN:
  704. {
  705. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  706. {
  707. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  708. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  709. }
  710. for (int i = 0; i < n_layer; ++i) {
  711. auto & layer = model.layers[i];
  712. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  713. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  714. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  715. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  716. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  717. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  718. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  719. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  720. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  721. }
  722. } break;
  723. case LLM_ARCH_FALCON:
  724. {
  725. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  726. // output
  727. {
  728. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  729. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  730. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  731. if (!model.output) {
  732. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  733. }
  734. }
  735. for (int i = 0; i < n_layer; ++i) {
  736. auto & layer = model.layers[i];
  737. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  738. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  739. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  740. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  741. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  742. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  743. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  744. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  745. }
  746. } break;
  747. case LLM_ARCH_STARCODER:
  748. {
  749. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  750. model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  751. // output
  752. {
  753. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  754. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  755. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  756. if (!model.output) {
  757. // needs to be on GPU
  758. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  759. }
  760. }
  761. for (int i = 0; i < n_layer; ++i) {
  762. auto & layer = model.layers[i];
  763. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  764. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  765. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  766. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  767. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  768. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  769. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  770. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  771. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  772. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  773. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  774. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  775. }
  776. } break;
  777. case LLM_ARCH_BERT:
  778. case LLM_ARCH_NOMIC_BERT:
  779. {
  780. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  781. model.type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0);
  782. if (model.arch == LLM_ARCH_BERT) {
  783. model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  784. model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  785. model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  786. model.cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  787. model.cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  788. }
  789. model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  790. model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  791. for (int i = 0; i < n_layer; ++i) {
  792. auto & layer = model.layers[i];
  793. if (model.arch == LLM_ARCH_BERT) {
  794. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  795. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  796. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  797. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  798. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  799. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  800. } else {
  801. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  802. }
  803. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  804. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  805. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  806. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  807. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  808. if (model.arch == LLM_ARCH_BERT) {
  809. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  810. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  811. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  812. } else {
  813. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  814. }
  815. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  816. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  817. }
  818. } break;
  819. case LLM_ARCH_JINA_BERT_V2:
  820. {
  821. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  822. model.type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}, 0); // token_type_embeddings
  823. model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
  824. model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
  825. model.cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  826. model.cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  827. for (int i = 0; i < n_layer; ++i) {
  828. auto & layer = model.layers[i]; // JinaBertLayer
  829. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  830. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  831. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  832. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  833. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  834. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  835. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  836. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  837. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  838. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  839. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
  840. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
  841. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
  842. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  843. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  844. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  845. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  846. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  847. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  848. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  849. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  850. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  851. }
  852. } break;
  853. case LLM_ARCH_BLOOM:
  854. {
  855. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  856. model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  857. model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  858. // output
  859. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  860. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  861. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  862. for (int i = 0; i < n_layer; ++i) {
  863. auto & layer = model.layers[i];
  864. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  865. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  866. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  867. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  868. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  869. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  870. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  871. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  872. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  873. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  874. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  875. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  876. }
  877. } break;
  878. case LLM_ARCH_MPT:
  879. {
  880. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  881. model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
  882. // output
  883. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  884. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  885. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  886. if (!model.output) {
  887. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  888. }
  889. for (int i = 0; i < n_layer; ++i) {
  890. auto & layer = model.layers[i];
  891. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  892. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  893. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  894. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  895. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  896. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  897. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  898. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  899. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  900. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  901. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  902. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  903. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  904. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  905. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  906. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  907. // AWQ ScaleActivation layer
  908. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  909. }
  910. } break;
  911. case LLM_ARCH_STABLELM:
  912. {
  913. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  914. // output
  915. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  916. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  917. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  918. for (int i = 0; i < n_layer; ++i) {
  919. auto & layer = model.layers[i];
  920. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  921. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  922. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  923. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  924. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  925. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  926. // optional bias tensors, present in Stable LM 2 1.6B
  927. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  928. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  929. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  930. // optional q and k layernorms, present in StableLM 2 12B
  931. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
  932. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
  933. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  934. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  935. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  936. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  937. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  938. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  939. }
  940. } break;
  941. case LLM_ARCH_QWEN:
  942. {
  943. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  944. // output
  945. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  946. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  947. for (int i = 0; i < n_layer; ++i) {
  948. auto & layer = model.layers[i];
  949. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  950. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  951. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  952. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  953. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  954. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  955. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  956. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
  957. }
  958. } break;
  959. case LLM_ARCH_QWEN2:
  960. case LLM_ARCH_QWEN2VL:
  961. {
  962. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  963. // output
  964. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  965. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  966. // if output is NULL, init from the input tok embed
  967. if (model.output == NULL) {
  968. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  969. }
  970. for (int i = 0; i < n_layer; ++i) {
  971. auto & layer = model.layers[i];
  972. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  973. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  974. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  975. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  976. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  977. // optional bias tensors
  978. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  979. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  980. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  981. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  982. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  983. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  984. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  985. }
  986. } break;
  987. case LLM_ARCH_QWEN2MOE:
  988. {
  989. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  990. // output
  991. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  992. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  993. for (int i = 0; i < n_layer; ++i) {
  994. auto & layer = model.layers[i];
  995. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  996. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  997. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  998. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  999. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1000. // optional bias tensors
  1001. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1002. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1003. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  1004. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1005. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1006. if (n_expert == 0) {
  1007. throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
  1008. }
  1009. if (n_expert_used == 0) {
  1010. throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
  1011. }
  1012. // MoE branch
  1013. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
  1014. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  1015. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  1016. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  1017. // Shared expert branch
  1018. const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  1019. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
  1020. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  1021. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
  1022. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  1023. }
  1024. } break;
  1025. case LLM_ARCH_PHI2:
  1026. {
  1027. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1028. // output
  1029. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1030. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1031. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1032. model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
  1033. for (int i = 0; i < n_layer; ++i) {
  1034. auto & layer = model.layers[i];
  1035. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1036. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1037. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1038. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1039. if (layer.wqkv == nullptr) {
  1040. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1041. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1042. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1043. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1044. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1045. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  1046. }
  1047. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1048. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1049. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1050. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1051. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1052. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1053. }
  1054. } break;
  1055. case LLM_ARCH_PHI3:
  1056. {
  1057. const int64_t n_embd_head = n_embd / n_head;
  1058. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  1059. // output
  1060. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  1061. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
  1062. for (int i = 0; i < n_layer; ++i) {
  1063. auto & layer = model.layers[i];
  1064. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  1065. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
  1066. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  1067. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
  1068. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  1069. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
  1070. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  1071. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  1072. }
  1073. } break;
  1074. case LLM_ARCH_PLAMO:
  1075. {
  1076. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1077. // output
  1078. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1079. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1080. for (int i = 0; i < n_layer; ++i) {
  1081. auto & layer = model.layers[i];
  1082. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1083. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1084. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1085. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1086. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1087. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1088. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1089. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1090. }
  1091. } break;
  1092. case LLM_ARCH_GPT2:
  1093. {
  1094. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1095. model.pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  1096. // output
  1097. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1098. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1099. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1100. for (int i = 0; i < n_layer; ++i) {
  1101. auto & layer = model.layers[i];
  1102. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1103. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1104. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1105. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  1106. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1107. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1108. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1109. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1110. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1111. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1112. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1113. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1114. }
  1115. } break;
  1116. case LLM_ARCH_CODESHELL:
  1117. {
  1118. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1119. // output
  1120. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1121. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1122. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1123. for (int i = 0; i < n_layer; ++i) {
  1124. auto & layer = model.layers[i];
  1125. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1126. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1127. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1128. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  1129. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1130. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1131. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1132. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1133. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1134. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1135. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1136. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1137. }
  1138. } break;
  1139. case LLM_ARCH_ORION:
  1140. {
  1141. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1142. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1143. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1144. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1145. for (int i = 0; i < n_layer; ++i) {
  1146. auto & layer = model.layers[i];
  1147. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1148. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1149. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1150. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1151. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1152. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1153. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1154. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1155. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1156. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1157. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1158. }
  1159. } break;
  1160. case LLM_ARCH_INTERNLM2:
  1161. {
  1162. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1163. // output
  1164. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1165. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1166. for (int i = 0; i < n_layer; ++i) {
  1167. auto & layer = model.layers[i];
  1168. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1169. // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1170. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1171. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1172. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1173. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1174. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1175. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1176. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1177. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1178. }
  1179. } break;
  1180. case LLM_ARCH_GEMMA:
  1181. {
  1182. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1183. // output
  1184. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1185. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  1186. for (int i = 0; i < n_layer; ++i) {
  1187. auto & layer = model.layers[i];
  1188. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1189. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1190. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1191. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1192. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1193. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1194. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1195. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1196. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1197. }
  1198. } break;
  1199. case LLM_ARCH_GEMMA2:
  1200. {
  1201. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1202. // output
  1203. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1204. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
  1205. for (int i = 0; i < n_layer; ++i) {
  1206. auto & layer = model.layers[i];
  1207. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1208. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1209. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1210. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1211. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1212. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  1213. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1214. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1215. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1216. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1217. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  1218. }
  1219. } break;
  1220. case LLM_ARCH_STARCODER2:
  1221. {
  1222. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1223. // output
  1224. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1225. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1226. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1227. // if output is NULL, init from the input tok embed
  1228. if (model.output == NULL) {
  1229. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  1230. }
  1231. for (int i = 0; i < n_layer; ++i) {
  1232. auto & layer = model.layers[i];
  1233. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1234. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1235. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1236. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1237. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1238. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
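  1239. // attention bias tensors -- NOTE(review): labelled optional, but created with flags 0 rather than llama_model_loader::TENSOR_NOT_REQUIRED; confirm intent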
  1240. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1241. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1242. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  1243. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1244. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1245. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1246. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1247. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
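  1248. // FFN bias tensors -- NOTE(review): labelled optional, but created with flags 0 rather than llama_model_loader::TENSOR_NOT_REQUIRED; confirm intent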
  1249. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1250. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
  1251. }
  1252. } break;
  1253. case LLM_ARCH_MAMBA:
  1254. {
  1255. const int64_t d_conv = hparams.ssm_d_conv;
  1256. const int64_t d_inner = hparams.ssm_d_inner;
  1257. const int64_t d_state = hparams.ssm_d_state;
  1258. const int64_t dt_rank = hparams.ssm_dt_rank;
  1259. // only an expansion factor of 2 is supported for now
  1260. if (2 * n_embd != d_inner) {
  1261. throw std::runtime_error("only an expansion factor of 2 is supported for now");
  1262. }
  1263. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1264. // output
  1265. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1266. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1267. // if output is NULL, init from the input tok embed, duplicated to allow offloading
  1268. if (model.output == NULL) {
  1269. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  1270. }
  1271. for (int i = 0; i < n_layer; ++i) {
  1272. auto & layer = model.layers[i];
  1273. // norm
  1274. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1275. layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
  1276. layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
  1277. layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
  1278. layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
  1279. layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
  1280. layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
  1281. // no "weight" suffix for these
  1282. layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
  1283. layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
  1284. // out_proj
  1285. layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
  1286. }
  1287. } break;
  1288. case LLM_ARCH_XVERSE:
  1289. {
  1290. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1291. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1292. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1293. for (int i = 0; i < n_layer; ++i) {
  1294. auto & layer = model.layers[i];
  1295. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1296. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1297. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1298. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1299. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1300. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1301. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1302. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1303. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1304. }
  1305. } break;
  1306. case LLM_ARCH_COMMAND_R:
  1307. {
  1308. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1309. // output
  1310. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1311. // init output from the input tok embed
  1312. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  1313. for (int i = 0; i < n_layer; ++i) {
  1314. auto & layer = model.layers[i];
  1315. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1316. if (n_layer >= 64){
  1317. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  1318. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  1319. }
  1320. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1321. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1322. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1323. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1324. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1325. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1326. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1327. }
  1328. } break;
  1329. case LLM_ARCH_COHERE2:
  1330. {
  1331. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
  1332. // output
  1333. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
  1334. // init output from the input tok embed
  1335. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
  1336. llama_model_loader::TENSOR_DUPLICATED);
  1337. for (int i = 0; i < n_layer; ++i) {
  1338. auto & layer = model.layers[i];
  1339. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
  1340. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
  1341. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
  1342. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
  1343. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
  1344. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
  1345. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
  1346. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
  1347. }
  1348. }
  1349. break;
  1350. case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
  1351. {
  1352. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1353. // output
  1354. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1355. // if output is NULL, init from the input tok embed
  1356. if (model.output == NULL) {
  1357. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  1358. }
  1359. for (int i = 0; i < n_layer; ++i) {
  1360. auto & layer = model.layers[i];
  1361. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1362. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1363. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1364. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1365. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1366. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1367. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1368. }
  1369. } break;
  1370. case LLM_ARCH_OLMO2:
  1371. {
  1372. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1373. // output
  1374. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1375. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1376. for (int i = 0; i < n_layer; ++i) {
  1377. auto & layer = model.layers[i];
  1378. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1379. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1380. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1381. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1382. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  1383. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
  1384. layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
  1385. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1386. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1387. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1388. layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
  1389. }
  1390. } break;
  1391. case LLM_ARCH_OLMOE:
  1392. {
  1393. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1394. // output
  1395. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1396. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1397. for (int i = 0; i < n_layer; ++i) {
  1398. auto & layer = model.layers[i];
  1399. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1400. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1401. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1402. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1403. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1404. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
  1405. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
  1406. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1407. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1408. if (n_expert == 0) {
  1409. throw std::runtime_error("n_expert must be > 0");
  1410. }
  1411. if (n_expert_used == 0) {
  1412. throw std::runtime_error("n_expert_used must be > 0");
  1413. }
  1414. // MoE branch
  1415. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1416. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  1417. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1418. }
  1419. } break;
  1420. case LLM_ARCH_OPENELM:
  1421. {
  1422. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1423. // output
  1424. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1425. // init output from the input tok embed
  1426. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  1427. for (int i = 0; i < n_layer; ++i) {
  1428. const int64_t n_head = hparams.n_head(i);
  1429. const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
  1430. const int64_t n_ff = hparams.n_ff(i);
  1431. auto & layer = model.layers[i];
  1432. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1433. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
  1434. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
  1435. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
  1436. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
  1437. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1438. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1439. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1440. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1441. }
  1442. } break;
  1443. case LLM_ARCH_GPTNEOX:
  1444. {
  1445. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1446. // output
  1447. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1448. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1449. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1450. for (int i = 0; i < n_layer; ++i) {
  1451. auto & layer = model.layers[i];
  1452. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1453. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1454. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1455. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  1456. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1457. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1458. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1459. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1460. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1461. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1462. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1463. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1464. }
  1465. } break;
  1466. case LLM_ARCH_ARCTIC:
  1467. {
  1468. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1469. // output
  1470. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1471. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1472. // if output is NULL, init from the input tok embed
  1473. if (model.output == NULL) {
  1474. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  1475. }
  1476. for (int i = 0; i < n_layer; ++i) {
  1477. auto & layer = model.layers[i];
  1478. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1479. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1480. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1481. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1482. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1483. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1484. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
  1485. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
  1486. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
  1487. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1488. layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
  1489. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
  1490. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  1491. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1492. }
  1493. } break;
  1494. case LLM_ARCH_DEEPSEEK:
  1495. {
  1496. const int64_t n_ff_exp = hparams.n_ff_exp;
  1497. const int64_t n_expert_shared = hparams.n_expert_shared;
  1498. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1499. // output
  1500. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1501. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1502. for (int i = 0; i < n_layer; ++i) {
  1503. auto & layer = model.layers[i];
  1504. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1505. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1506. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1507. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1508. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1509. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1510. if (i < (int) hparams.n_layer_dense_lead) {
  1511. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1512. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1513. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1514. } else {
  1515. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1516. if (n_expert == 0) {
  1517. throw std::runtime_error("n_expert must be > 0");
  1518. }
  1519. if (n_expert_used == 0) {
  1520. throw std::runtime_error("n_expert_used must be > 0");
  1521. }
  1522. // MoE branch
  1523. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  1524. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  1525. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  1526. // Shared expert branch
  1527. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  1528. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  1529. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  1530. }
  1531. }
  1532. } break;
  1533. case LLM_ARCH_DEEPSEEK2:
  1534. {
  1535. const bool is_lite = (hparams.n_layer == 27);
  1536. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  1537. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  1538. const int64_t q_lora_rank = hparams.n_lora_q;
  1539. const int64_t kv_lora_rank = hparams.n_lora_kv;
  1540. const int64_t n_ff_exp = hparams.n_ff_exp;
  1541. const int64_t n_expert_shared = hparams.n_expert_shared;
  1542. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1543. // output
  1544. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1545. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1546. for (int i = 0; i < n_layer; ++i) {
  1547. auto & layer = model.layers[i];
  1548. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1549. if (!is_lite) {
  1550. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  1551. }
  1552. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  1553. if (!is_lite) {
  1554. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  1555. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  1556. } else {
  1557. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1558. }
  1559. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  1560. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  1561. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  1562. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1563. if (i < (int) hparams.n_layer_dense_lead) {
  1564. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1565. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1566. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1567. } else {
  1568. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1569. layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1570. if (n_expert == 0) {
  1571. throw std::runtime_error("n_expert must be > 0");
  1572. }
  1573. if (n_expert_used == 0) {
  1574. throw std::runtime_error("n_expert_used must be > 0");
  1575. }
  1576. // MoE branch
  1577. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  1578. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  1579. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  1580. // Shared expert branch
  1581. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  1582. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
  1583. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
  1584. }
  1585. }
  1586. } break;
  1587. case LLM_ARCH_BITNET:
  1588. {
  1589. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1590. // output
  1591. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1592. for (int i = 0; i < n_layer; ++i) {
  1593. auto & layer = model.layers[i];
  1594. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1595. layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
  1596. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1597. layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1598. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1599. layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1600. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1601. layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1602. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1603. layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1604. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1605. layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
  1606. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1607. layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1608. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1609. layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1610. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1611. layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1612. }
  1613. } break;
  1614. case LLM_ARCH_T5:
  1615. {
  1616. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  1617. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1618. // output
  1619. model.output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1620. model.output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1621. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1622. // if output is NULL, init from the input tok embed
  1623. if (model.output == NULL) {
  1624. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  1625. }
  1626. for (int i = 0; i < n_layer; ++i) {
  1627. auto & layer = model.layers[i];
  1628. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  1629. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1630. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1631. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1632. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1633. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  1634. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  1635. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1636. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1637. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1638. layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
  1639. layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1640. layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1641. layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1642. layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1643. layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  1644. layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
  1645. // this tensor seems to be unused in HF transformers implementation
  1646. layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1647. layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1648. layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1649. layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1650. layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  1651. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
  1652. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1653. layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1654. layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1655. }
  1656. } break;
  1657. case LLM_ARCH_T5ENCODER:
  1658. {
  1659. const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
  1660. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1661. // output
  1662. model.output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1663. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1664. // if output is NULL, init from the input tok embed
  1665. if (model.output == NULL) {
  1666. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  1667. }
  1668. for (int i = 0; i < n_layer; ++i) {
  1669. auto & layer = model.layers[i];
  1670. layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
  1671. layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1672. layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1673. layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1674. layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1675. layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
  1676. layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
  1677. layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1678. layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1679. layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1680. }
  1681. } break;
  1682. case LLM_ARCH_JAIS:
  1683. {
  1684. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1685. // output
  1686. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1687. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1688. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1689. for (int i = 0; i < n_layer; ++i) {
  1690. auto & layer = model.layers[i];
  1691. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1692. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1693. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1694. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  1695. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1696. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1697. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1698. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1699. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1700. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1701. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1702. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
  1703. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1704. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1705. }
  1706. } break;
  1707. case LLM_ARCH_CHATGLM:
  1708. {
  1709. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1710. // output
  1711. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1712. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1713. for (int i = 0; i < n_layer; ++i) {
  1714. auto & layer = model.layers[i];
  1715. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1716. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1717. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  1718. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1719. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1720. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
  1721. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1722. }
  1723. } break;
  1724. case LLM_ARCH_NEMOTRON:
  1725. {
  1726. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1727. // output
  1728. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1729. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1730. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1731. for (int i = 0; i < n_layer; ++i) {
  1732. auto & layer = model.layers[i];
  1733. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1734. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1735. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1736. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1737. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1738. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1739. // optional bias tensors
  1740. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1741. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1742. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1743. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1744. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1745. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1746. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1747. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1748. // optional MLP bias
  1749. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1750. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1751. }
  1752. } break;
  1753. case LLM_ARCH_EXAONE:
  1754. {
  1755. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1756. // output
  1757. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1758. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1759. for (int i = 0; i < n_layer; ++i) {
  1760. auto & layer = model.layers[i];
  1761. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1762. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1763. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1764. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1765. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1766. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1767. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  1768. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1769. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1770. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1771. }
  1772. } break;
  1773. case LLM_ARCH_RWKV6:
  1774. {
  1775. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1776. // Block 0, LN0
  1777. model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  1778. model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  1779. // output
  1780. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1781. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1782. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1783. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  1784. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  1785. const int head_size = hparams.wkv_head_size;
  1786. const int attn_hidden_size = n_embd;
  1787. const int ffn_size = hparams.n_ff_arr[0];
  1788. for (int i = 0; i < n_layer; ++i) {
  1789. auto & layer = model.layers[i];
  1790. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1791. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1792. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
  1793. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
  1794. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  1795. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  1796. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  1797. layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, 0);
  1798. layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  1799. layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, 0);
  1800. layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  1801. layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, 0);
  1802. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
  1803. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  1804. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  1805. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  1806. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
  1807. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
  1808. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  1809. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  1810. layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
  1811. layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
  1812. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  1813. layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
  1814. layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
  1815. layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
  1816. layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
  1817. layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
  1818. }
  1819. } break;
  1820. case LLM_ARCH_CHAMELEON:
  1821. {
  1822. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1823. // output
  1824. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1825. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1826. // if output is NULL, init from the input tok embed
  1827. if (model.output == NULL) {
  1828. model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  1829. }
  1830. for (int i = 0; i < n_layer; ++i) {
  1831. auto & layer = model.layers[i];
  1832. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1833. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  1834. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  1835. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1836. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1837. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1838. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1839. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1840. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1841. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1842. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1843. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1844. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1845. }
  1846. } break;
  1847. case LLM_ARCH_SOLAR:
  1848. {
  1849. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1850. // output
  1851. {
  1852. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1853. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1854. }
  1855. for (int i = 0; i < n_layer; ++i) {
  1856. auto & layer = model.layers[i];
  1857. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1858. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1859. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1860. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1861. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1862. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1863. layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  1864. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1865. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1866. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1867. }
  1868. } break;
  1869. case LLM_ARCH_WAVTOKENIZER_DEC:
  1870. {
  1871. model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
  1872. model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
  1873. model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
  1874. // posnet
  1875. {
  1876. const int64_t n_embd = hparams.posnet.n_embd;
  1877. for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
  1878. auto & layer = model.layers[i].posnet;
  1879. // posnet:
  1880. //
  1881. // - resnet
  1882. // - resnet
  1883. // - attn
  1884. // - resnet
  1885. // - resnet
  1886. // - norm
  1887. //
  1888. switch (i) {
  1889. case 0:
  1890. case 1:
  1891. case 3:
  1892. case 4:
  1893. {
  1894. layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
  1895. layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
  1896. layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
  1897. layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
  1898. layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
  1899. layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
  1900. layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
  1901. layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
  1902. } break;
  1903. case 2:
  1904. {
  1905. layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  1906. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  1907. layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
  1908. layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
  1909. layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
  1910. layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
  1911. layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
  1912. layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
  1913. layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
  1914. layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
  1915. } break;
  1916. case 5:
  1917. {
  1918. layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  1919. layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  1920. } break;
  1921. default: GGML_ABORT("unknown posnet layer");
  1922. };
  1923. }
  1924. }
  1925. GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
  1926. model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
  1927. model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
  1928. // convnext
  1929. {
  1930. const int64_t n_embd = hparams.convnext.n_embd;
  1931. for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
  1932. auto & layer = model.layers[i].convnext;
  1933. layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
  1934. layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
  1935. layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
  1936. layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
  1937. layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
  1938. layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
  1939. layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
  1940. layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
  1941. layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
  1942. }
  1943. // output
  1944. model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1945. model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1946. }
  1947. model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
  1948. model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
  1949. } break;
  1950. default:
  1951. throw std::runtime_error("unknown architecture");
  1952. }
  1953. if (n_moved_tensors > 0) {
  1954. LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
  1955. __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
  1956. ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
  1957. }
  1958. }
  1959. ml.done_getting_tensors();
  1960. ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
  1961. model.mappings.reserve(ml.mappings.size());
  1962. // create the backend buffers
  1963. std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
  1964. ctx_bufs.reserve(ctx_map.size());
  1965. // Ensure we have enough capacity for the maximum backend buffer we will potentially create
  1966. const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
  1967. model.bufs.reserve(n_max_backend_buffer);
  1968. for (auto & it : ctx_map) {
  1969. ggml_backend_buffer_type_t buft = it.first;
  1970. ggml_context * ctx = it.second;
  1971. // skip contexts without tensors
  1972. if (ggml_get_first_tensor(ctx) == nullptr) {
  1973. continue;
  1974. }
  1975. llama_buf_map bufs;
  1976. bufs.reserve(n_max_backend_buffer);
  1977. // check if it is possible to use buffer_from_host_ptr with this buffer type
  1978. ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
  1979. if (!dev) {
  1980. // FIXME: workaround for CPU backend buft having a NULL device
  1981. dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1982. }
  1983. ggml_backend_dev_props props;
  1984. ggml_backend_dev_get_props(dev, &props);
  1985. bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  1986. bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
  1987. if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  1988. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  1989. // only the mmap region containing the tensors in the model is mapped to the backend buffer
  1990. // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
  1991. // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  1992. void * addr = nullptr;
  1993. size_t first, last; // NOLINT
  1994. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  1995. if (first >= last) {
  1996. continue;
  1997. }
  1998. const size_t max_size = ggml_get_max_tensor_size(ctx);
  1999. ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  2000. if (buf == nullptr) {
  2001. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  2002. }
  2003. model.bufs.emplace_back(buf);
  2004. bufs.emplace(idx, buf);
  2005. }
  2006. }
  2007. else {
  2008. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  2009. if (buf == nullptr) {
  2010. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  2011. }
  2012. model.bufs.emplace_back(buf);
  2013. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  2014. model.mlock_bufs.emplace_back(new llama_mlock);
  2015. auto & mlock_buf = model.mlock_bufs.back();
  2016. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  2017. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  2018. }
  2019. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  2020. bufs.emplace(idx, buf);
  2021. }
  2022. }
  2023. if (bufs.empty()) {
  2024. throw std::runtime_error("failed to allocate buffer");
  2025. }
  2026. for (auto & buf : bufs) {
  2027. // indicate that this buffer contains weights
  2028. // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
  2029. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  2030. }
  2031. ctx_bufs.emplace_back(ctx, bufs);
  2032. }
  2033. if (llama_supports_gpu_offload()) {
  2034. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  2035. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  2036. if (n_gpu_layers > (int) hparams.n_layer) {
  2037. LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
  2038. }
  2039. const int max_backend_supported_layers = hparams.n_layer + 1;
  2040. const int max_offloadable_layers = hparams.n_layer + 1;
  2041. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  2042. }
  2043. // print memory requirements per buffer type
  2044. for (auto & buf : model.bufs) {
  2045. LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  2046. }
  2047. // populate tensors_by_name
  2048. for (auto & ctx : model.ctxs) {
  2049. for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  2050. model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  2051. }
  2052. }
  2053. // load tensor data
  2054. for (auto & it : ctx_bufs) {
  2055. ggml_context * ctx = it.first;
  2056. auto & bufs = it.second;
  2057. if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
  2058. return false;
  2059. }
  2060. }
  2061. if (use_mmap_buffer) {
  2062. for (auto & mapping : ml.mappings) {
  2063. model.mappings.emplace_back(std::move(mapping));
  2064. }
  2065. }
  2066. return true;
  2067. }
  2068. // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
  2069. static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
  2070. model.t_start_us = ggml_time_us();
  2071. try {
  2072. llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
  2073. model.hparams.vocab_only = params.vocab_only;
  2074. try {
  2075. llm_load_arch(ml, model);
  2076. } catch(const std::exception & e) {
  2077. throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
  2078. }
  2079. try {
  2080. llm_load_hparams(ml, model);
  2081. } catch(const std::exception & e) {
  2082. throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
  2083. }
  2084. try {
  2085. llm_load_vocab(ml, model);
  2086. } catch(const std::exception & e) {
  2087. throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
  2088. }
  2089. llm_load_stats(ml, model);
  2090. llm_load_print_meta(ml, model);
  2091. if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
  2092. model.hparams.n_vocab != model.vocab.id_to_token.size()) {
  2093. LLAMA_LOG_WARN("%s: vocab mismatch %u !- %zu ...\n", __func__, model.hparams.n_vocab, model.vocab.id_to_token.size());
  2094. }
  2095. if (params.vocab_only) {
  2096. LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
  2097. return 0;
  2098. }
  2099. if (!llm_load_tensors(
  2100. ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
  2101. params.progress_callback, params.progress_callback_user_data
  2102. )) {
  2103. return -2;
  2104. }
  2105. } catch (const std::exception & err) {
  2106. LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
  2107. return -1;
  2108. }
  2109. // loading time will be recalculate after the first eval, so
  2110. // we take page faults deferred by mmap() into consideration
  2111. model.t_load_us = ggml_time_us() - model.t_start_us;
  2112. return 0;
  2113. }
  2114. //
  2115. // llm_build
  2116. //
  // Callback handed to the llm_build_* helpers. They invoke it on graph tensors
  // they create, passing the tensor, a short label and a layer index (callers in
  // this file pass -1 for inputs such as "inp_tokens").
  using llm_build_cb = std::function<void(
          struct ggml_tensor * cur,
          const char         * name,
          int                  nl)>;
  // Activation selector consumed by llm_build_ffn (and, for SILU/GELU, by
  // llm_build_moe_ffn).
  enum llm_ffn_op_type {
      LLM_FFN_SILU     = 0, // ggml_silu
      LLM_FFN_GELU     = 1, // ggml_gelu, optionally divided by act_scales
      LLM_FFN_RELU     = 2, // ggml_relu
      LLM_FFN_RELU_SQR = 3, // ggml_relu followed by ggml_sqr
      LLM_FFN_SWIGLU   = 4, // input split in two halves: silu(first) * second
  };
  // How llm_build_ffn wires the gate projection relative to the up projection.
  enum llm_ffn_gate_type {
      LLM_FFN_SEQ = 0, // ffn_gate is applied to the output of ffn_up
      LLM_FFN_PAR = 1, // ffn_gate is parallel to ffn_up
  };
  // Normalization selector consumed by llm_build_norm.
  enum llm_norm_type {
      LLM_NORM       = 0, // ggml_norm with hparams.f_norm_eps
      LLM_NORM_RMS   = 1, // ggml_rms_norm with hparams.f_norm_rms_eps
      LLM_NORM_GROUP = 2, // ggml_group_norm with hparams.n_norm_groups / f_norm_group_eps
  };
  2134. static struct ggml_tensor * llm_build_inp_embd(
  2135. struct ggml_context * ctx,
  2136. struct llama_context & lctx,
  2137. const llama_hparams & hparams,
  2138. const llama_ubatch & batch,
  2139. struct ggml_tensor * tok_embd,
  2140. const llm_build_cb & cb) {
  2141. const int64_t n_embd = hparams.n_embd;
  2142. struct ggml_tensor * inpL;
  2143. if (batch.token) {
  2144. lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
  2145. cb(lctx.inp_tokens, "inp_tokens", -1);
  2146. ggml_set_input(lctx.inp_tokens);
  2147. inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
  2148. } else {
  2149. lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
  2150. inpL = lctx.inp_embd;
  2151. ggml_set_input(lctx.inp_embd);
  2152. }
  2153. // For Granite architecture
  2154. if (hparams.f_embedding_scale != 0.0f) {
  2155. inpL = ggml_scale(ctx, inpL, hparams.f_embedding_scale);
  2156. }
  2157. cb(inpL, "inp_embd", -1);
  2158. return inpL;
  2159. }
  2160. static struct ggml_tensor * llm_build_inp_cross_attn_state(
  2161. struct ggml_context * ctx,
  2162. struct llama_context & lctx,
  2163. const llama_hparams & hparams,
  2164. const llm_build_cb & cb) {
  2165. const int64_t n_embd = hparams.n_embd;
  2166. struct ggml_tensor * inpCAS = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, 1601, 4);
  2167. cb(inpCAS, "inp_cross_attn_state", -1);
  2168. ggml_set_input(inpCAS);
  2169. lctx.inp_cross_attn_state = inpCAS;
  2170. return inpCAS;
  2171. }
  2172. static void llm_build_kv_store(
  2173. struct ggml_context * ctx,
  2174. const llama_hparams & hparams,
  2175. const llama_cparams & cparams,
  2176. const llama_kv_cache & kv,
  2177. struct ggml_cgraph * graph,
  2178. struct ggml_tensor * k_cur,
  2179. struct ggml_tensor * v_cur,
  2180. int32_t n_tokens,
  2181. int32_t kv_head,
  2182. const llm_build_cb & cb,
  2183. int64_t il) {
  2184. const int64_t n_ctx = cparams.n_ctx;
  2185. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  2186. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
  2187. GGML_ASSERT(kv.size == n_ctx);
  2188. struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head);
  2189. cb(k_cache_view, "k_cache_view", il);
  2190. // note: storing RoPE-ed version of K in the KV cache
  2191. ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
  2192. assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
  2193. struct ggml_tensor * v_cache_view = nullptr;
  2194. if (cparams.flash_attn) {
  2195. v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head);
  2196. } else {
  2197. // note: the V cache is transposed when not using flash attention
  2198. v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
  2199. ( n_ctx)*ggml_element_size(kv.v_l[il]),
  2200. (kv_head)*ggml_element_size(kv.v_l[il]));
  2201. v_cur = ggml_transpose(ctx, v_cur);
  2202. }
  2203. cb(v_cache_view, "v_cache_view", il);
  2204. ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
  2205. }
  2206. // do mat_mul, while optionally apply lora
  2207. static struct ggml_tensor * llm_build_lora_mm(
  2208. struct llama_context & lctx,
  2209. struct ggml_context * ctx0,
  2210. struct ggml_tensor * w,
  2211. struct ggml_tensor * cur) {
  2212. struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
  2213. for (auto & it : lctx.lora_adapters) {
  2214. struct llama_lora_weight * lora = it.first->get_weight(w);
  2215. if (lora == nullptr) {
  2216. continue;
  2217. }
  2218. const float alpha = it.first->alpha;
  2219. const float rank = (float) lora->b->ne[0];
  2220. const float scale = alpha ? it.second * alpha / rank : it.second;
  2221. struct ggml_tensor * ab_cur = ggml_mul_mat(
  2222. ctx0, lora->b,
  2223. ggml_mul_mat(ctx0, lora->a, cur)
  2224. );
  2225. ab_cur = ggml_scale(ctx0, ab_cur, scale);
  2226. res = ggml_add(ctx0, res, ab_cur);
  2227. }
  2228. return res;
  2229. }
  2230. // do mat_mul_id, while optionally apply lora
  2231. static struct ggml_tensor * llm_build_lora_mm_id(
  2232. struct llama_context & lctx,
  2233. struct ggml_context * ctx0,
  2234. struct ggml_tensor * w, // struct ggml_tensor * as
  2235. struct ggml_tensor * cur, // struct ggml_tensor * b
  2236. struct ggml_tensor * ids) {
  2237. struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
  2238. for (auto & it : lctx.lora_adapters) {
  2239. struct llama_lora_weight * lora = it.first->get_weight(w);
  2240. if (lora == nullptr) {
  2241. continue;
  2242. }
  2243. const float alpha = it.first->alpha;
  2244. const float rank = (float) lora->b->ne[0];
  2245. const float scale = alpha ? it.second * alpha / rank : it.second;
  2246. struct ggml_tensor * ab_cur = ggml_mul_mat_id(
  2247. ctx0, lora->b,
  2248. ggml_mul_mat_id(ctx0, lora->a, cur, ids),
  2249. ids
  2250. );
  2251. ab_cur = ggml_scale(ctx0, ab_cur, scale);
  2252. res = ggml_add(ctx0, res, ab_cur);
  2253. }
  2254. return res;
  2255. }
  2256. static struct ggml_tensor * llm_build_norm(
  2257. struct ggml_context * ctx,
  2258. struct ggml_tensor * cur,
  2259. const llama_hparams & hparams,
  2260. struct ggml_tensor * mw,
  2261. struct ggml_tensor * mb,
  2262. llm_norm_type type,
  2263. const llm_build_cb & cb,
  2264. int il) {
  2265. switch (type) {
  2266. case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break;
  2267. case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break;
  2268. case LLM_NORM_GROUP:
  2269. {
  2270. cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
  2271. cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
  2272. cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
  2273. } break;
  2274. }
  2275. if (mw || mb) {
  2276. cb(cur, "norm", il);
  2277. }
  2278. if (mw) {
  2279. cur = ggml_mul(ctx, cur, mw);
  2280. if (mb) {
  2281. cb(cur, "norm_w", il);
  2282. }
  2283. }
  2284. if (mb) {
  2285. cur = ggml_add(ctx, cur, mb);
  2286. }
  2287. return cur;
  2288. }
  2289. static struct ggml_tensor * llm_build_ffn(
  2290. struct ggml_context * ctx,
  2291. struct llama_context & lctx,
  2292. struct ggml_tensor * cur,
  2293. struct ggml_tensor * up,
  2294. struct ggml_tensor * up_b,
  2295. struct ggml_tensor * up_s,
  2296. struct ggml_tensor * gate,
  2297. struct ggml_tensor * gate_b,
  2298. struct ggml_tensor * gate_s,
  2299. struct ggml_tensor * down,
  2300. struct ggml_tensor * down_b,
  2301. struct ggml_tensor * down_s,
  2302. struct ggml_tensor * act_scales,
  2303. llm_ffn_op_type type_op,
  2304. llm_ffn_gate_type type_gate,
  2305. const llm_build_cb & cb,
  2306. int il) {
  2307. struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur;
  2308. cb(tmp, "ffn_up", il);
  2309. if (up_b) {
  2310. tmp = ggml_add(ctx, tmp, up_b);
  2311. cb(tmp, "ffn_up_b", il);
  2312. }
  2313. if (up_s) {
  2314. tmp = ggml_mul(ctx, tmp, up_s);
  2315. cb(tmp, "ffn_up_s", il);
  2316. }
  2317. if (gate) {
  2318. switch (type_gate) {
  2319. case LLM_FFN_SEQ:
  2320. {
  2321. cur = llm_build_lora_mm(lctx, ctx, gate, tmp);
  2322. cb(cur, "ffn_gate", il);
  2323. } break;
  2324. case LLM_FFN_PAR:
  2325. {
  2326. cur = llm_build_lora_mm(lctx, ctx, gate, cur);
  2327. cb(cur, "ffn_gate", il);
  2328. } break;
  2329. }
  2330. if (gate_b) {
  2331. cur = ggml_add(ctx, cur, gate_b);
  2332. cb(cur, "ffn_gate_b", il);
  2333. }
  2334. if (gate_s) {
  2335. cur = ggml_mul(ctx, cur, gate_s);
  2336. cb(cur, "ffn_gate_s", il);
  2337. }
  2338. } else {
  2339. cur = tmp;
  2340. }
  2341. switch (type_op) {
  2342. case LLM_FFN_SILU:
  2343. {
  2344. cur = ggml_silu(ctx, cur);
  2345. cb(cur, "ffn_silu", il);
  2346. } break;
  2347. case LLM_FFN_GELU:
  2348. {
  2349. cur = ggml_gelu(ctx, cur);
  2350. cb(cur, "ffn_gelu", il);
  2351. if (act_scales != NULL) {
  2352. cur = ggml_div(ctx, cur, act_scales);
  2353. cb(cur, "ffn_act", il);
  2354. }
  2355. } break;
  2356. case LLM_FFN_RELU:
  2357. {
  2358. cur = ggml_relu(ctx, cur);
  2359. cb(cur, "ffn_relu", il);
  2360. } break;
  2361. case LLM_FFN_RELU_SQR:
  2362. {
  2363. cur = ggml_relu(ctx, cur);
  2364. cb(cur, "ffn_relu", il);
  2365. cur = ggml_sqr(ctx, cur);
  2366. cb(cur, "ffn_sqr(relu)", il);
  2367. } break;
  2368. case LLM_FFN_SWIGLU:
  2369. {
  2370. // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
  2371. int64_t split_point = cur->ne[0] / 2;
  2372. struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
  2373. struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
  2374. x0 = ggml_silu(ctx, x0);
  2375. cb(cur, "ffn_silu", il);
  2376. cur = ggml_mul(ctx, x0, x1);
  2377. cb(cur, "ffn_mul", il);
  2378. } break;
  2379. }
  2380. if (type_gate == LLM_FFN_PAR) {
  2381. cur = ggml_mul(ctx, cur, tmp);
  2382. cb(cur, "ffn_gate_par", il);
  2383. }
  2384. if (down) {
  2385. cur = llm_build_lora_mm(lctx, ctx, down, cur);
  2386. }
  2387. if (down_b) {
  2388. cb(cur, "ffn_down", il);
  2389. }
  2390. if (down_b) {
  2391. cur = ggml_add(ctx, cur, down_b);
  2392. }
  2393. if (down_s) {
  2394. cur = ggml_mul(ctx, cur, down_s);
  2395. cb(cur, "ffn_down_s", il);
  2396. }
  2397. return cur;
  2398. }
  2399. static struct ggml_tensor * llm_build_moe_ffn(
  2400. struct ggml_context * ctx,
  2401. struct llama_context & lctx,
  2402. struct ggml_tensor * cur,
  2403. struct ggml_tensor * gate_inp,
  2404. struct ggml_tensor * up_exps,
  2405. struct ggml_tensor * gate_exps,
  2406. struct ggml_tensor * down_exps,
  2407. struct ggml_tensor * exp_probs_b,
  2408. int64_t n_expert,
  2409. int64_t n_expert_used,
  2410. llm_ffn_op_type type_op,
  2411. bool norm_w,
  2412. bool scale_w,
  2413. float w_scale,
  2414. llama_expert_gating_func_type gating_op,
  2415. const llm_build_cb & cb,
  2416. int il) {
  2417. int64_t n_embd = cur->ne[0];
  2418. int64_t n_tokens = cur->ne[1];
  2419. ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
  2420. cb(logits, "ffn_moe_logits", il);
  2421. ggml_tensor * probs = nullptr;
  2422. switch (gating_op) {
  2423. case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX:
  2424. {
  2425. probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
  2426. } break;
  2427. case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID:
  2428. {
  2429. probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
  2430. } break;
  2431. default:
  2432. GGML_ABORT("fatal error");
  2433. }
  2434. cb(probs, "ffn_moe_probs", il);
  2435. // add experts selection bias - introduced in DeepSeek V3
  2436. // leave probs unbiased as it's later used to get expert weights
  2437. ggml_tensor * selection_probs = probs;
  2438. if (exp_probs_b != nullptr) {
  2439. selection_probs = ggml_add(ctx, probs, exp_probs_b);
  2440. cb(selection_probs, "ffn_moe_probs_biased", il);
  2441. }
  2442. // select experts
  2443. ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
  2444. cb(selected_experts->src[0], "ffn_moe_argsort", il);
  2445. cb(selected_experts, "ffn_moe_topk", il);
  2446. ggml_tensor * weights = ggml_get_rows(ctx,
  2447. ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
  2448. cb(weights, "ffn_moe_weights", il);
  2449. if (norm_w) {
  2450. weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
  2451. ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
  2452. cb(weights_sum, "ffn_moe_weights_sum", il);
  2453. weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
  2454. cb(weights, "ffn_moe_weights_norm", il);
  2455. weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
  2456. }
  2457. if (scale_w) {
  2458. weights = ggml_scale(ctx, weights, w_scale);
  2459. cb(weights, "ffn_moe_weights_scaled", il);
  2460. }
  2461. cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
  2462. ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
  2463. cb(up, "ffn_moe_up", il);
  2464. ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
  2465. cb(gate, "ffn_moe_gate", il);
  2466. switch (type_op) {
  2467. case LLM_FFN_SILU:
  2468. {
  2469. gate = ggml_silu(ctx, gate);
  2470. cb(gate, "ffn_moe_silu", il);
  2471. } break;
  2472. case LLM_FFN_GELU:
  2473. {
  2474. gate = ggml_gelu(ctx, gate);
  2475. cb(gate, "ffn_moe_gelu", il);
  2476. } break;
  2477. default:
  2478. GGML_ABORT("fatal error");
  2479. }
  2480. ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
  2481. cb(par, "ffn_moe_gate_par", il);
  2482. ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
  2483. cb(experts, "ffn_moe_down", il);
  2484. experts = ggml_mul(ctx, experts, weights);
  2485. // aggregate experts
  2486. ggml_tensor * moe_out = nullptr;
  2487. for (int i = 0; i < n_expert_used; ++i) {
  2488. ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
  2489. experts->nb[2], i*experts->nb[1]);
  2490. if (i == 0) {
  2491. moe_out = cur_expert;
  2492. } else {
  2493. moe_out = ggml_add(ctx, moe_out, cur_expert);
  2494. }
  2495. }
  2496. if (n_expert_used == 1) {
  2497. // avoid returning a non-contiguous tensor
  2498. moe_out = ggml_cont(ctx, moe_out);
  2499. }
  2500. return moe_out;
  2501. }
  2502. static struct ggml_tensor * llm_build_kqv(
  2503. struct ggml_context * ctx,
  2504. struct llama_context & lctx,
  2505. const llama_kv_cache & kv,
  2506. struct ggml_cgraph * graph,
  2507. struct ggml_tensor * wo,
  2508. struct ggml_tensor * wo_b,
  2509. struct ggml_tensor * q_cur,
  2510. struct ggml_tensor * kq_mask,
  2511. int32_t n_tokens,
  2512. int32_t n_kv,
  2513. float kq_scale,
  2514. const llm_build_cb & cb,
  2515. int il) {
  2516. const llama_model & model = lctx.model;
  2517. const llama_hparams & hparams = lctx.model.hparams;
  2518. const llama_cparams & cparams = lctx.cparams;
  2519. const int64_t n_ctx = cparams.n_ctx;
  2520. const int64_t n_head = hparams.n_head(il);
  2521. const int64_t n_head_kv = hparams.n_head_kv(il);
  2522. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  2523. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  2524. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  2525. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
  2526. struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
  2527. cb(q, "q", il);
  2528. struct ggml_tensor * k =
  2529. ggml_view_3d(ctx, kv.k_l[il],
  2530. n_embd_head_k, n_kv, n_head_kv,
  2531. ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
  2532. ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
  2533. 0);
  2534. cb(k, "k", il);
  2535. struct ggml_tensor * cur;
  2536. if (cparams.flash_attn) {
  2537. GGML_UNUSED(model);
  2538. GGML_UNUSED(n_ctx);
  2539. // split cached v into n_head heads (not transposed)
  2540. struct ggml_tensor * v =
  2541. ggml_view_3d(ctx, kv.v_l[il],
  2542. n_embd_head_v, n_kv, n_head_kv,
  2543. ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
  2544. ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
  2545. 0);
  2546. cb(v, "v", il);
  2547. cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
  2548. hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
  2549. ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
  2550. cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
  2551. } else {
  2552. struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  2553. cb(kq, "kq", il);
  2554. // note: this op tends to require high floating point range
  2555. // while for some models F16 is enough, for others it is not, so we default to F32 here
  2556. ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
  2557. if (model.arch == LLM_ARCH_GROK) {
  2558. // need to do the following:
  2559. // multiply by attn_output_multiplyer of 0.08838834764831845
  2560. // and then :
  2561. // kq = 30 * tanh(kq / 30)
  2562. // before the softmax below
  2563. kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
  2564. kq = ggml_scale(ctx, kq, 30);
  2565. }
  2566. if (hparams.attn_soft_cap) {
  2567. kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping);
  2568. kq = ggml_tanh(ctx, kq);
  2569. kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping);
  2570. }
  2571. kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
  2572. cb(kq, "kq_soft_max_ext", il);
  2573. GGML_ASSERT(kv.size == n_ctx);
  2574. // split cached v into n_head heads
  2575. struct ggml_tensor * v =
  2576. ggml_view_3d(ctx, kv.v_l[il],
  2577. n_kv, n_embd_head_v, n_head_kv,
  2578. ggml_element_size(kv.v_l[il])*n_ctx,
  2579. ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
  2580. 0);
  2581. cb(v, "v", il);
  2582. struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
  2583. cb(kqv, "kqv", il);
  2584. struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
  2585. cb(kqv_merged, "kqv_merged", il);
  2586. cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
  2587. cb(cur, "kqv_merged_cont", il);
  2588. }
  2589. ggml_build_forward_expand(graph, cur);
  2590. if (wo) {
  2591. cur = llm_build_lora_mm(lctx, ctx, wo, cur);
  2592. }
  2593. if (wo_b) {
  2594. cb(cur, "kqv_wo", il);
  2595. }
  2596. if (wo_b) {
  2597. cur = ggml_add(ctx, cur, wo_b);
  2598. }
  2599. return cur;
  2600. }
  2601. static struct ggml_tensor * llm_build_kv(
  2602. struct ggml_context * ctx,
  2603. struct llama_context & lctx,
  2604. const llama_kv_cache & kv,
  2605. struct ggml_cgraph * graph,
  2606. struct ggml_tensor * wo,
  2607. struct ggml_tensor * wo_b,
  2608. struct ggml_tensor * k_cur,
  2609. struct ggml_tensor * v_cur,
  2610. struct ggml_tensor * q_cur,
  2611. struct ggml_tensor * kq_mask,
  2612. int32_t n_tokens,
  2613. int32_t kv_head,
  2614. int32_t n_kv,
  2615. float kq_scale,
  2616. const llm_build_cb & cb,
  2617. int il) {
  2618. const llama_hparams & hparams = lctx.model.hparams;
  2619. const llama_cparams & cparams = lctx.cparams;
  2620. // these nodes are added to the graph together so that they are not reordered
  2621. // by doing so, the number of splits in the graph is reduced
  2622. ggml_build_forward_expand(graph, q_cur);
  2623. ggml_build_forward_expand(graph, k_cur);
  2624. ggml_build_forward_expand(graph, v_cur);
  2625. llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
  2626. struct ggml_tensor * cur;
  2627. cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
  2628. cb(cur, "kqv_out", il);
  2629. return cur;
  2630. }
  2631. static struct ggml_tensor * llm_build_copy_mask_state(
  2632. struct ggml_context * ctx,
  2633. struct ggml_cgraph * graph,
  2634. struct ggml_tensor * s,
  2635. struct ggml_tensor * state_copy,
  2636. struct ggml_tensor * state_mask,
  2637. int32_t n_state,
  2638. int32_t kv_size,
  2639. int32_t kv_head,
  2640. int32_t n_kv,
  2641. int32_t n_seqs) {
  2642. struct ggml_tensor * states = ggml_reshape_2d(ctx, s, n_state, kv_size);
  2643. // copy states
  2644. // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
  2645. // this shrinks the tensors's ne[1] to n_kv
  2646. states = ggml_get_rows(ctx, states, state_copy);
  2647. // clear states of sequences which are starting at the beginning of this batch
  2648. // FIXME: zero-out NANs?
  2649. states = ggml_mul(ctx, states, state_mask);
  2650. // copy states which won't be changed further (between n_seqs and n_kv)
  2651. ggml_build_forward_expand(graph,
  2652. ggml_cpy(ctx,
  2653. ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),
  2654. ggml_view_1d(ctx, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s))));
  2655. // the part of the states that will be used and modified
  2656. return ggml_view_2d(ctx, states, n_state, n_seqs, states->nb[1], 0);
  2657. }
  2658. // TODO: split
  2659. static struct ggml_tensor * llm_build_mamba(
  2660. struct ggml_context * ctx,
  2661. struct llama_context & lctx,
  2662. const llama_ubatch & batch,
  2663. struct ggml_cgraph * graph,
  2664. struct ggml_tensor * cur,
  2665. struct ggml_tensor * state_copy,
  2666. struct ggml_tensor * state_mask,
  2667. int32_t kv_head,
  2668. int32_t n_kv,
  2669. const llm_build_cb & cb,
  2670. int il) {
  2671. const llama_model & model = lctx.model;
  2672. const llama_hparams & hparams = model.hparams;
  2673. const llama_kv_cache & kv = lctx.kv_self;
  2674. const int64_t d_conv = hparams.ssm_d_conv;
  2675. const int64_t d_inner = hparams.ssm_d_inner;
  2676. const int64_t d_state = hparams.ssm_d_state;
  2677. const int64_t dt_rank = hparams.ssm_dt_rank;
  2678. const int64_t n_seqs = batch.n_seqs;
  2679. // Some variants of Mamba arch (e.g. FalconMamba do apply layer norm on B and Dt layers)
  2680. const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
  2681. // Use the same RMS norm as the final layer norm
  2682. const float norm_rms_eps = hparams.f_norm_rms_eps;
  2683. const int64_t n_seq_tokens = batch.n_seq_tokens;
  2684. GGML_ASSERT(n_seqs != 0);
  2685. GGML_ASSERT(batch.equal_seqs);
  2686. GGML_ASSERT(batch.n_tokens == n_seq_tokens * n_seqs);
  2687. struct ggml_tensor * conv_states_all = kv.k_l[il];
  2688. struct ggml_tensor * ssm_states_all = kv.v_l[il];
  2689. // (ab)using the KV cache to store the states
  2690. struct ggml_tensor * conv = llm_build_copy_mask_state(ctx,
  2691. graph, conv_states_all, state_copy, state_mask,
  2692. hparams.n_embd_k_s(), kv.size, kv_head, n_kv, n_seqs);
  2693. conv = ggml_reshape_3d(ctx, conv, d_conv - 1, d_inner, n_seqs);
  2694. struct ggml_tensor * ssm = llm_build_copy_mask_state(ctx,
  2695. graph, ssm_states_all, state_copy, state_mask,
  2696. hparams.n_embd_v_s(), kv.size, kv_head, n_kv, n_seqs);
  2697. ssm = ggml_reshape_3d(ctx, ssm, d_state, d_inner, n_seqs);
  2698. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  2699. cur = ggml_reshape_3d(ctx, cur, cur->ne[0], n_seq_tokens, n_seqs);
  2700. // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
  2701. struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_in, cur);
  2702. // split the above in two
  2703. // => {d_inner, n_seq_tokens, n_seqs}
  2704. struct ggml_tensor * x = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
  2705. struct ggml_tensor * z = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz));
  2706. // conv
  2707. {
  2708. // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
  2709. struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_transpose(ctx, x), 0);
  2710. // copy last (d_conv - 1) columns back into the state cache
  2711. struct ggml_tensor * last_conv = ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  2712. ggml_build_forward_expand(graph,
  2713. ggml_cpy(ctx, last_conv,
  2714. ggml_view_1d(ctx, conv_states_all,
  2715. (d_conv - 1)*(d_inner)*(n_seqs),
  2716. kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
  2717. // 1D convolution
  2718. // The equivalent is to make a self-overlapping view of conv_x
  2719. // over d_conv columns at each stride in the 3rd dimension,
  2720. // then element-wise multiply that with the conv1d weight,
  2721. // then sum the elements of each row,
  2722. // (the last two steps are a dot product over rows (also doable with mul_mat))
  2723. // then permute away the ne[0] dimension,
  2724. // and then you're left with the resulting x tensor.
  2725. // For simultaneous sequences, all sequences need to have the same length.
  2726. x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d);
  2727. // bias
  2728. x = ggml_add(ctx, x, model.layers[il].ssm_conv1d_b);
  2729. x = ggml_silu(ctx, x);
  2730. }
  2731. // ssm
  2732. {
  2733. // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
  2734. struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_x, x);
  2735. // split
  2736. struct ggml_tensor * dt = ggml_view_3d(ctx, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
  2737. struct ggml_tensor * B = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
  2738. struct ggml_tensor * C = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
  2739. // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
  2740. if (ssm_dt_b_c_rms) {
  2741. dt = ggml_rms_norm(ctx, dt, norm_rms_eps);
  2742. B = ggml_rms_norm(ctx, B, norm_rms_eps);
  2743. C = ggml_rms_norm(ctx, C, norm_rms_eps);
  2744. }
  2745. // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
  2746. dt = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_dt, dt);
  2747. dt = ggml_add(ctx, dt, model.layers[il].ssm_dt_b);
  2748. // Custom operator to optimize the parallel associative scan
  2749. // as described in the Annex D of the Mamba paper.
  2750. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  2751. struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C);
  2752. // store last states
  2753. ggml_build_forward_expand(graph,
  2754. ggml_cpy(ctx,
  2755. ggml_view_1d(ctx, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
  2756. ggml_view_1d(ctx, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  2757. struct ggml_tensor * y = ggml_view_3d(ctx, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0);
  2758. // TODO: skip computing output earlier for unused tokens
  2759. // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
  2760. y = ggml_add(ctx, y, ggml_mul(ctx, x, model.layers[il].ssm_d));
  2761. y = ggml_mul(ctx, y, ggml_silu(ctx, ggml_cont(ctx, z)));
  2762. // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  2763. cur = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_out, y);
  2764. }
  2765. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  2766. cur = ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs);
  2767. cb(cur, "mamba_out", il);
  2768. return cur;
  2769. }
  2770. static struct ggml_tensor * llm_build_rwkv6_time_mix(
  2771. struct llama_context & lctx,
  2772. struct ggml_context * ctx,
  2773. const struct llama_layer * layer,
  2774. struct ggml_tensor * cur,
  2775. struct ggml_tensor * x_prev,
  2776. struct ggml_tensor ** wkv_state) {
  2777. size_t n_embd = cur->ne[0];
  2778. size_t n_seq_tokens = cur->ne[1];
  2779. size_t n_seqs = cur->ne[2];
  2780. size_t head_size = layer->time_mix_first->ne[0];
  2781. size_t head_count = layer->time_mix_first->ne[1];
  2782. size_t n_tokens = n_seqs * n_seq_tokens;
  2783. struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
  2784. sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens);
  2785. cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
  2786. struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
  2787. xxx = ggml_reshape_4d(
  2788. ctx,
  2789. ggml_tanh(
  2790. ctx,
  2791. ggml_mul_mat(ctx, layer->time_mix_w1, xxx)
  2792. ),
  2793. layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens
  2794. );
  2795. xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2));
  2796. xxx = ggml_mul_mat(
  2797. ctx,
  2798. ggml_reshape_4d(
  2799. ctx,
  2800. layer->time_mix_w2,
  2801. layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5
  2802. ),
  2803. xxx
  2804. );
  2805. struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
  2806. struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
  2807. struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
  2808. struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
  2809. struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
  2810. struct ggml_tensor * xw = ggml_add(
  2811. ctx,
  2812. ggml_mul(
  2813. ctx,
  2814. ggml_add(ctx, mw, layer->time_mix_lerp_w),
  2815. sx
  2816. ),
  2817. cur
  2818. );
  2819. struct ggml_tensor * xk = ggml_add(
  2820. ctx,
  2821. ggml_mul(
  2822. ctx,
  2823. ggml_add(ctx, mk, layer->time_mix_lerp_k),
  2824. sx
  2825. ),
  2826. cur
  2827. );
  2828. struct ggml_tensor * xv = ggml_add(
  2829. ctx,
  2830. ggml_mul(
  2831. ctx,
  2832. ggml_add(ctx, mv, layer->time_mix_lerp_v),
  2833. sx
  2834. ),
  2835. cur
  2836. );
  2837. struct ggml_tensor * xr = ggml_add(
  2838. ctx,
  2839. ggml_mul(
  2840. ctx,
  2841. ggml_add(ctx, mr, layer->time_mix_lerp_r),
  2842. sx
  2843. ),
  2844. cur
  2845. );
  2846. struct ggml_tensor * xg = ggml_add(
  2847. ctx,
  2848. ggml_mul(
  2849. ctx,
  2850. ggml_add(ctx, mg, layer->time_mix_lerp_g),
  2851. sx
  2852. ),
  2853. cur
  2854. );
  2855. struct ggml_tensor * r = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
  2856. struct ggml_tensor * k = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
  2857. struct ggml_tensor * v = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
  2858. struct ggml_tensor * g = ggml_silu(
  2859. ctx,
  2860. llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
  2861. );
  2862. struct ggml_tensor * w = ggml_mul_mat(
  2863. ctx,
  2864. layer->time_mix_decay_w2,
  2865. ggml_tanh(
  2866. ctx,
  2867. ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
  2868. )
  2869. );
  2870. w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd));
  2871. w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w)));
  2872. w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
  2873. k = ggml_transpose(ctx, k);
  2874. v = ggml_transpose(ctx, v);
  2875. r = ggml_transpose(ctx, r);
  2876. struct ggml_tensor * wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
  2877. cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
  2878. *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
  2879. // group norm with head_count groups
  2880. cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
  2881. cur = ggml_norm(ctx, cur, 64e-5f);
  2882. // Convert back to regular vectors.
  2883. cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
  2884. cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
  2885. cur = ggml_mul(ctx, cur, g);
  2886. cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
  2887. return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs);
  2888. }
  2889. static struct ggml_tensor * llm_build_rwkv6_channel_mix(
  2890. struct llama_context & lctx,
  2891. struct ggml_context * ctx,
  2892. const struct llama_layer * layer,
  2893. struct ggml_tensor * cur,
  2894. struct ggml_tensor * x_prev) {
  2895. struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
  2896. struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
  2897. struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
  2898. struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr));
  2899. struct ggml_tensor * k = ggml_sqr(
  2900. ctx,
  2901. ggml_relu(
  2902. ctx,
  2903. llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
  2904. )
  2905. );
  2906. return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
  2907. }
  // block of KV slots to move when defragging
  // build_defrag() copies `len` consecutive cache cells starting at `src` to the
  // cells starting at `dst`, for both the K and the V cache of every layer
  struct llama_kv_defrag_move {
      uint32_t src; // index of the first source cell
      uint32_t dst; // index of the first destination cell
      uint32_t len; // number of consecutive cells to copy
  };
  2914. struct llm_build_context {
  // NOTE: the declaration order below is the order in which the constructor
  //       initializes the members - later members read earlier ones
  //       (e.g. hparams is taken from model, n_outputs may read n_tokens)

  // references to the context being built for and to its configuration
  const llama_model & model;
  llama_context & lctx;
  const llama_hparams & hparams;
  const llama_cparams & cparams;
  const llama_ubatch & ubatch;
  const llama_kv_cache & kv_self;

  // model dimensions, copied from hparams (n_ctx comes from cparams)
  const int64_t n_embd;
  const int64_t n_layer;
  const int64_t n_rot;
  const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
  const int64_t n_head;
  const int64_t n_head_kv;
  const int64_t n_embd_head_k;
  const int64_t n_embd_k_gqa;
  const int64_t n_embd_head_v;
  const int64_t n_embd_v_gqa;
  const int64_t n_expert;
  const int64_t n_expert_used;

  // RoPE / YaRN parameters, copied from cparams
  const float freq_base;
  const float freq_scale;
  const float ext_factor;
  const float attn_factor;
  const float beta_fast;
  const float beta_slow;

  // normalization epsilons, copied from hparams
  const float norm_eps;
  const float norm_rms_eps;

  // batch and KV cache geometry (worst-case values are used when the constructor is given worst_case = true)
  const int32_t n_tokens;
  const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
  const int32_t n_outputs;
  const int32_t n_outputs_enc;
  const int32_t kv_head; // index of where we store new KV data in the cache
  const int32_t n_ctx_orig;

  const bool flash_attn;

  const enum llama_pooling_type pooling_type;
  const enum llama_rope_type rope_type;

  // callback invoked on the tensors created by the build functions, with a name and a layer index (-1 for non-layer tensors)
  const llm_build_cb & cb;

  // backing memory for ctx0 (see init())
  std::vector<uint8_t> & buf_compute_meta;

  // ggml context used to create the graph tensors; set by init(), reset by free()
  struct ggml_context * ctx0 = nullptr;
  // TODO: consider making the entire interface noexcept
  // Captures references to the context, its model and the micro-batch, and
  // caches the hyper-parameters used by the build functions.
  //
  // When worst_case is true the sizes are set to their upper bounds instead of
  // the current state of the context:
  //   n_kv          = kv_self.size
  //   n_outputs     = n_tokens
  //   n_outputs_enc = n_tokens
  //   kv_head       = 0 for a recurrent cache, otherwise kv_self.size - n_tokens
  llm_build_context(
          llama_context & lctx,
          const llama_ubatch & ubatch,
          const llm_build_cb & cb,
          bool worst_case) :
      model            (lctx.model),
      lctx             (lctx),
      hparams          (model.hparams),
      cparams          (lctx.cparams),
      ubatch           (ubatch),
      kv_self          (lctx.kv_self),
      n_embd           (hparams.n_embd),
      n_layer          (hparams.n_layer),
      n_rot            (hparams.n_rot),
      n_ctx            (cparams.n_ctx),
      n_head           (hparams.n_head()),
      n_head_kv        (hparams.n_head_kv()),
      n_embd_head_k    (hparams.n_embd_head_k),
      n_embd_k_gqa     (hparams.n_embd_k_gqa()),
      n_embd_head_v    (hparams.n_embd_head_v),
      n_embd_v_gqa     (hparams.n_embd_v_gqa()),
      n_expert         (hparams.n_expert),
      n_expert_used    (hparams.n_expert_used),
      freq_base        (cparams.rope_freq_base),
      freq_scale       (cparams.rope_freq_scale),
      ext_factor       (cparams.yarn_ext_factor),
      attn_factor      (cparams.yarn_attn_factor),
      beta_fast        (cparams.yarn_beta_fast),
      beta_slow        (cparams.yarn_beta_slow),
      norm_eps         (hparams.f_norm_eps),
      norm_rms_eps     (hparams.f_norm_rms_eps),
      n_tokens         (ubatch.n_tokens),
      n_kv             (worst_case ? kv_self.size : kv_self.n),
      // n_tokens below is the member initialized just above
      n_outputs        (worst_case ? n_tokens : lctx.n_outputs),
      n_outputs_enc    (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd),
      kv_head          (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
      n_ctx_orig       (cparams.n_ctx_orig_yarn),
      flash_attn       (cparams.flash_attn),
      pooling_type     (cparams.pooling_type),
      rope_type        (hparams.rope_type),
      cb               (cb),
      buf_compute_meta (lctx.buf_compute_meta) {
      // all initializations should be done in init()
  }
  2998. void init() {
  2999. struct ggml_init_params params = {
  3000. /*.mem_size =*/ buf_compute_meta.size(),
  3001. /*.mem_buffer =*/ buf_compute_meta.data(),
  3002. /*.no_alloc =*/ true,
  3003. };
  3004. ctx0 = ggml_init(params);
  3005. lctx.inp_tokens = nullptr;
  3006. lctx.inp_embd = nullptr;
  3007. lctx.inp_pos = nullptr;
  3008. lctx.inp_out_ids = nullptr;
  3009. lctx.inp_KQ_mask = nullptr;
  3010. lctx.inp_KQ_mask_swa = nullptr;
  3011. lctx.inp_K_shift = nullptr;
  3012. lctx.inp_mean = nullptr;
  3013. lctx.inp_cls = nullptr;
  3014. lctx.inp_s_copy = nullptr;
  3015. lctx.inp_s_mask = nullptr;
  3016. lctx.inp_s_seq = nullptr;
  3017. lctx.inp_pos_bucket = nullptr;
  3018. lctx.inp_embd_enc = nullptr;
  3019. lctx.inp_KQ_mask_cross = nullptr;
  3020. lctx.inp_cross_attn_state = nullptr;
  3021. }
  3022. void free() {
  3023. ggml_free(ctx0);
  3024. ctx0 = nullptr;
  3025. }
  3026. struct ggml_cgraph * build_k_shift() {
  3027. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  3028. GGML_ASSERT(kv_self.size == n_ctx);
  3029. lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
  3030. cb(lctx.inp_K_shift, "K_shift", -1);
  3031. ggml_set_input(lctx.inp_K_shift);
  3032. for (int il = 0; il < n_layer; ++il) {
  3033. const int64_t n_head_kv = hparams.n_head_kv(il);
  3034. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  3035. struct ggml_tensor * rope_factors = build_rope_factors(il);
  3036. struct ggml_tensor * k =
  3037. ggml_view_3d(ctx0, kv_self.k_l[il],
  3038. n_embd_head_k, n_head_kv, n_ctx,
  3039. ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
  3040. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  3041. 0);
  3042. struct ggml_tensor * tmp;
  3043. if (ggml_is_quantized(k->type)) {
  3044. // dequantize to f32 -> RoPE -> quantize back
  3045. tmp = ggml_cast(ctx0, k, GGML_TYPE_F32);
  3046. cb(tmp, "K_f32", il);
  3047. for (auto & backend : lctx.backends) {
  3048. // Figure out which backend KV cache belongs to
  3049. if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) {
  3050. ggml_backend_sched_set_tensor_backend(lctx.sched.get(), tmp, backend.get());
  3051. break;
  3052. }
  3053. }
  3054. tmp = ggml_rope_ext_inplace(ctx0, tmp,
  3055. lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3056. ext_factor, attn_factor, beta_fast, beta_slow);
  3057. cb(tmp, "K_shifted_f32", il);
  3058. tmp = ggml_cpy(ctx0, tmp, k);
  3059. } else {
  3060. // we rotate only the first n_rot dimensions
  3061. tmp = ggml_rope_ext_inplace(ctx0, k,
  3062. lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3063. ext_factor, attn_factor, beta_fast, beta_slow);
  3064. }
  3065. cb(tmp, "K_shifted", il);
  3066. ggml_build_forward_expand(gf, tmp);
  3067. }
  3068. return gf;
  3069. }
  3070. struct ggml_cgraph * build_defrag(const std::vector<struct llama_kv_defrag_move> & moves) {
  3071. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  3072. for (const auto & move : moves) {
  3073. for (int il = 0; il < n_layer; ++il) {
  3074. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  3075. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
  3076. ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
  3077. n_embd_k_gqa, move.len,
  3078. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  3079. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.src));
  3080. ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
  3081. n_embd_k_gqa, move.len,
  3082. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  3083. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*move.dst));
  3084. ggml_tensor * view_v_src;
  3085. ggml_tensor * view_v_dst;
  3086. if (flash_attn) {
  3087. // NOTE: the V cache is not transposed when using flash attention
  3088. view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
  3089. n_embd_v_gqa, move.len,
  3090. ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
  3091. ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.src));
  3092. view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
  3093. n_embd_v_gqa, move.len,
  3094. ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
  3095. ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*move.dst));
  3096. } else {
  3097. view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
  3098. move.len, n_embd_v_gqa,
  3099. ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
  3100. ggml_row_size(kv_self.v_l[il]->type, move.src));
  3101. view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
  3102. move.len, n_embd_v_gqa,
  3103. ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
  3104. ggml_row_size(kv_self.v_l[il]->type, move.dst));
  3105. }
  3106. ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
  3107. ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
  3108. }
  3109. }
  3110. //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
  3111. return gf;
  3112. }
  3113. struct ggml_tensor * build_inp_pos() {
  3114. lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  3115. cb(lctx.inp_pos, "inp_pos", -1);
  3116. ggml_set_input(lctx.inp_pos);
  3117. return lctx.inp_pos;
  3118. }
  3119. struct ggml_tensor * build_rope_factors(int il) {
  3120. // choose long/short freq factors based on the context size
  3121. const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
  3122. if (model.layers[il].rope_freqs != nullptr) {
  3123. return model.layers[il].rope_freqs;
  3124. }
  3125. if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
  3126. return model.layers[il].rope_long;
  3127. }
  3128. return model.layers[il].rope_short;
  3129. }
  3130. struct ggml_tensor * build_inp_out_ids() {
  3131. lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
  3132. cb(lctx.inp_out_ids, "inp_out_ids", -1);
  3133. ggml_set_input(lctx.inp_out_ids);
  3134. return lctx.inp_out_ids;
  3135. }
  3136. struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
  3137. lctx.inp_KQ_mask = causal
  3138. ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
  3139. : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
  3140. cb(lctx.inp_KQ_mask, "KQ_mask", -1);
  3141. ggml_set_input(lctx.inp_KQ_mask);
  3142. return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
  3143. }
  3144. struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) {
  3145. GGML_ASSERT(hparams.n_swa > 0);
  3146. lctx.inp_KQ_mask_swa = causal
  3147. ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
  3148. : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
  3149. cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1);
  3150. ggml_set_input(lctx.inp_KQ_mask_swa);
  3151. return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa;
  3152. }
  3153. struct ggml_tensor * build_inp_mean() {
  3154. lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
  3155. cb(lctx.inp_mean, "inp_mean", -1);
  3156. ggml_set_input(lctx.inp_mean);
  3157. return lctx.inp_mean;
  3158. }
  3159. struct ggml_tensor * build_inp_cls() {
  3160. lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  3161. cb(lctx.inp_cls, "inp_cls", -1);
  3162. ggml_set_input(lctx.inp_cls);
  3163. return lctx.inp_cls;
  3164. }
  3165. struct ggml_tensor * build_inp_s_copy() {
  3166. lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv);
  3167. cb(lctx.inp_s_copy, "inp_s_copy", -1);
  3168. ggml_set_input(lctx.inp_s_copy);
  3169. return lctx.inp_s_copy;
  3170. }
  3171. struct ggml_tensor * build_inp_s_mask() {
  3172. lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
  3173. cb(lctx.inp_s_mask, "inp_s_mask", -1);
  3174. ggml_set_input(lctx.inp_s_mask);
  3175. return lctx.inp_s_mask;
  3176. }
  3177. struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
  3178. // find result_norm tensor for input
  3179. struct ggml_tensor * inp = nullptr;
  3180. for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
  3181. inp = ggml_graph_node(gf, i);
  3182. if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
  3183. break;
  3184. } else {
  3185. inp = nullptr;
  3186. }
  3187. }
  3188. GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
  3189. struct ggml_tensor * cur;
  3190. switch (pooling_type) {
  3191. case LLAMA_POOLING_TYPE_NONE:
  3192. {
  3193. cur = inp;
  3194. } break;
  3195. case LLAMA_POOLING_TYPE_MEAN:
  3196. {
  3197. struct ggml_tensor * inp_mean = build_inp_mean();
  3198. cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
  3199. } break;
  3200. case LLAMA_POOLING_TYPE_CLS:
  3201. case LLAMA_POOLING_TYPE_LAST:
  3202. {
  3203. struct ggml_tensor * inp_cls = build_inp_cls();
  3204. cur = ggml_get_rows(ctx0, inp, inp_cls);
  3205. } break;
  3206. case LLAMA_POOLING_TYPE_RANK:
  3207. {
  3208. struct ggml_tensor * inp_cls = build_inp_cls();
  3209. inp = ggml_get_rows(ctx0, inp, inp_cls);
  3210. // classification head
  3211. // https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
  3212. GGML_ASSERT(model.cls != nullptr);
  3213. GGML_ASSERT(model.cls_b != nullptr);
  3214. cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls, inp), model.cls_b);
  3215. cur = ggml_tanh(ctx0, cur);
  3216. // some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
  3217. // https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
  3218. if (model.cls_out) {
  3219. GGML_ASSERT(model.cls_out_b != nullptr);
  3220. cur = ggml_add (ctx0, ggml_mul_mat(ctx0, model.cls_out, cur), model.cls_out_b);
  3221. }
  3222. } break;
  3223. default:
  3224. {
  3225. GGML_ABORT("unknown pooling type");
  3226. }
  3227. }
  3228. cb(cur, "result_embd_pooled", -1);
  3229. ggml_build_forward_expand(gf, cur);
  3230. return gf;
  3231. }
  3232. struct ggml_tensor * llm_build_pos_bucket(bool causal) {
  3233. if (causal) {
  3234. lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
  3235. } else {
  3236. lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens);
  3237. }
  3238. ggml_set_input(lctx.inp_pos_bucket);
  3239. cb(lctx.inp_pos_bucket, "pos_bucket", -1);
  3240. return lctx.inp_pos_bucket;
  3241. }
  3242. struct ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) {
  3243. struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0);
  3244. cb(pos_bucket_1d, "pos_bucket_1d", -1);
  3245. struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d);
  3246. cb(pos_bias, "pos_bias", -1);
  3247. pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0);
  3248. cb(pos_bias, "pos_bias", -1);
  3249. pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3);
  3250. cb(pos_bias, "pos_bias", -1);
  3251. pos_bias = ggml_cont(ctx0, pos_bias);
  3252. cb(pos_bias, "pos_bias", -1);
  3253. return pos_bias;
  3254. }
  3255. struct ggml_tensor * llm_build_inp_embd_enc() {
  3256. const int64_t n_embd = hparams.n_embd;
  3257. lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc);
  3258. ggml_set_input(lctx.inp_embd_enc);
  3259. cb(lctx.inp_embd_enc, "embd_enc", -1);
  3260. return lctx.inp_embd_enc;
  3261. }
  3262. struct ggml_tensor * llm_build_inp_KQ_mask_cross() {
  3263. lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
  3264. ggml_set_input(lctx.inp_KQ_mask_cross);
  3265. cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1);
  3266. return lctx.inp_KQ_mask_cross;
  3267. }
  3268. struct ggml_cgraph * build_llama() {
  3269. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  3270. // mutable variable, needed during the last layer of the computation to skip unused tokens
  3271. int32_t n_tokens = this->n_tokens;
  3272. const int64_t n_embd_head = hparams.n_embd_head_v;
  3273. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3274. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3275. struct ggml_tensor * cur;
  3276. struct ggml_tensor * inpL;
  3277. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  3278. // inp_pos - contains the positions
  3279. struct ggml_tensor * inp_pos = build_inp_pos();
  3280. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  3281. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  3282. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  3283. for (int il = 0; il < n_layer; ++il) {
  3284. struct ggml_tensor * inpSA = inpL;
  3285. // norm
  3286. cur = llm_build_norm(ctx0, inpL, hparams,
  3287. model.layers[il].attn_norm, NULL,
  3288. LLM_NORM_RMS, cb, il);
  3289. cb(cur, "attn_norm", il);
  3290. // self-attention
  3291. {
  3292. // rope freq factors for llama3; may return nullptr for llama2 and other models
  3293. struct ggml_tensor * rope_factors = build_rope_factors(il);
  3294. // compute Q and K and RoPE them
  3295. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  3296. cb(Qcur, "Qcur", il);
  3297. if (model.layers[il].bq) {
  3298. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  3299. cb(Qcur, "Qcur", il);
  3300. }
  3301. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  3302. cb(Kcur, "Kcur", il);
  3303. if (model.layers[il].bk) {
  3304. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  3305. cb(Kcur, "Kcur", il);
  3306. }
  3307. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  3308. cb(Vcur, "Vcur", il);
  3309. if (model.layers[il].bv) {
  3310. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  3311. cb(Vcur, "Vcur", il);
  3312. }
  3313. Qcur = ggml_rope_ext(
  3314. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
  3315. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3316. ext_factor, attn_factor, beta_fast, beta_slow
  3317. );
  3318. cb(Qcur, "Qcur", il);
  3319. Kcur = ggml_rope_ext(
  3320. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
  3321. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3322. ext_factor, attn_factor, beta_fast, beta_slow
  3323. );
  3324. cb(Kcur, "Kcur", il);
  3325. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  3326. model.layers[il].wo, model.layers[il].bo,
  3327. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
  3328. }
  3329. if (il == n_layer - 1) {
  3330. // skip computing output for unused tokens
  3331. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  3332. n_tokens = n_outputs;
  3333. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3334. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  3335. }
  3336. // For Granite architecture
  3337. if (hparams.f_residual_scale) {
  3338. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  3339. }
  3340. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  3341. cb(ffn_inp, "ffn_inp", il);
  3342. // feed-forward network
  3343. if (model.layers[il].ffn_gate_inp == nullptr) {
  3344. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  3345. model.layers[il].ffn_norm, NULL,
  3346. LLM_NORM_RMS, cb, il);
  3347. cb(cur, "ffn_norm", il);
  3348. cur = llm_build_ffn(ctx0, lctx, cur,
  3349. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  3350. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  3351. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  3352. NULL,
  3353. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  3354. cb(cur, "ffn_out", il);
  3355. } else {
  3356. // MoE branch
  3357. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  3358. model.layers[il].ffn_norm, NULL,
  3359. LLM_NORM_RMS, cb, il);
  3360. cb(cur, "ffn_norm", il);
  3361. cur = llm_build_moe_ffn(ctx0, lctx, cur,
  3362. model.layers[il].ffn_gate_inp,
  3363. model.layers[il].ffn_up_exps,
  3364. model.layers[il].ffn_gate_exps,
  3365. model.layers[il].ffn_down_exps,
  3366. nullptr,
  3367. n_expert, n_expert_used,
  3368. LLM_FFN_SILU, true,
  3369. false, 0.0,
  3370. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  3371. cb, il);
  3372. cb(cur, "ffn_moe_out", il);
  3373. }
  3374. // For Granite architecture
  3375. if (hparams.f_residual_scale) {
  3376. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  3377. }
  3378. cur = ggml_add(ctx0, cur, ffn_inp);
  3379. cb(cur, "ffn_out", il);
  3380. cur = lctx.cvec.apply_to(ctx0, cur, il);
  3381. cb(cur, "l_out", il);
  3382. // input for next layer
  3383. inpL = cur;
  3384. }
  3385. cur = inpL;
  3386. cur = llm_build_norm(ctx0, cur, hparams,
  3387. model.output_norm, NULL,
  3388. LLM_NORM_RMS, cb, -1);
  3389. cb(cur, "result_norm", -1);
  3390. // lm_head
  3391. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  3392. // For Granite architecture
  3393. if (hparams.f_logit_scale) {
  3394. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  3395. }
  3396. cb(cur, "result_output", -1);
  3397. ggml_build_forward_expand(gf, cur);
  3398. return gf;
  3399. }
  3400. struct ggml_cgraph * build_mllama() {
  3401. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  3402. // mutable variable, needed during the last layer of the computation to skip unused tokens
  3403. int32_t n_tokens = this->n_tokens;
  3404. const int64_t n_embd_head = hparams.n_embd_head_v;
  3405. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3406. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3407. struct ggml_tensor * cur;
  3408. struct ggml_tensor * inpL;
  3409. struct ggml_tensor * inpCAS;
  3410. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  3411. inpCAS = llm_build_inp_cross_attn_state(ctx0, lctx, hparams, cb);
  3412. // inp_pos - contains the positions
  3413. struct ggml_tensor * inp_pos = build_inp_pos();
  3414. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  3415. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  3416. for (int il = 0; il < n_layer; ++il) {
  3417. struct ggml_tensor * inpSA = inpL;
  3418. // norm
  3419. cur = llm_build_norm(ctx0, inpL, hparams,
  3420. model.layers[il].attn_norm, NULL,
  3421. LLM_NORM_RMS, cb, il);
  3422. cb(cur, "attn_norm", il);
  3423. if (hparams.cross_attention_layers(il)) {
  3424. if (!ubatch.embd && !cparams.cross_attn) {
  3425. continue;
  3426. }
  3427. // cross attention layer
  3428. struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_q_proj, cur);
  3429. cb(Qcur, "Qcur", il);
  3430. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  3431. cb(Qcur, "Qcur", il);
  3432. Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 2, 1, 3));
  3433. cb(Qcur, "Qcur", il);
  3434. Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].cross_attn_q_norm, NULL, LLM_NORM_RMS, cb, il);
  3435. cb(Qcur, "Qcur", il);
  3436. struct ggml_tensor * Kcur, * Vcur;
  3437. if (ubatch.embd) {
  3438. Kcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_k_proj, inpCAS);
  3439. cb(Kcur, "Kcur", il);
  3440. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, 6404);
  3441. cb(Kcur, "Kcur", il);
  3442. Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  3443. cb(Kcur, "Kcur", il);
  3444. Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].cross_attn_k_norm, NULL, LLM_NORM_RMS, cb, il);
  3445. cb(Kcur, "Kcur", il);
  3446. ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, kv_self.k_l[il]));
  3447. Vcur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_v_proj, inpCAS);
  3448. cb(Vcur, "Vcur", il);
  3449. Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, 6404);
  3450. cb(Vcur, "Vcur", il);
  3451. Vcur = ggml_permute(ctx0, Vcur, 0, 2, 1, 3);
  3452. cb(Vcur, "Vcur", il);
  3453. ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, kv_self.v_l[il]));
  3454. } else {
  3455. Kcur = ggml_view_tensor(ctx0, kv_self.k_l[il]);
  3456. cb(Kcur, "Kcur (view)", il);
  3457. Vcur = ggml_view_tensor(ctx0, kv_self.v_l[il]);
  3458. cb(Vcur, "Vcur (view)", il);
  3459. }
  3460. struct ggml_tensor * kq = ggml_mul_mat(ctx0, Kcur, Qcur);
  3461. cb(kq, "kq", il);
  3462. // TODO: apply causal masks
  3463. struct ggml_tensor * kq_soft_max = ggml_soft_max_ext(ctx0, kq, nullptr, 1.f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
  3464. cb(kq_soft_max, "kq_soft_max", il);
  3465. Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, Vcur));
  3466. cb(Vcur, "Vcur", il);
  3467. struct ggml_tensor * kqv = ggml_mul_mat(ctx0, Vcur, kq_soft_max);
  3468. cb(kqv, "kqv", il);
  3469. struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  3470. cb(kqv_merged, "kqv_merged", il);
  3471. cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
  3472. cb(cur, "kqv_merged_cont", il);
  3473. cur = ggml_mul_mat(ctx0, model.layers[il].cross_attn_o_proj, cur);
  3474. cb(cur, "cur", il);
  3475. // TODO: do this in place once?
  3476. cur = ggml_mul(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_attn_gate));
  3477. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  3478. cb(ffn_inp, "ffn_inp", il);
  3479. // feed-forward network
  3480. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  3481. model.layers[il].ffn_norm, NULL,
  3482. LLM_NORM_RMS, cb, il);
  3483. cb(cur, "ffn_norm", il);
  3484. cur = llm_build_ffn(ctx0, lctx, cur,
  3485. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  3486. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  3487. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  3488. NULL,
  3489. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  3490. cb(cur, "ffn_out", il);
  3491. // TODO: do this inplace once?
  3492. cur = ggml_add_inplace(ctx0, ggml_mul_inplace(ctx0, cur, ggml_tanh(ctx0, model.layers[il].cross_attn_mlp_gate)), ffn_inp);
  3493. cb(cur, "ffn_out", il);
  3494. cur = lctx.cvec.apply_to(ctx0, cur, il);
  3495. cb(cur, "l_out", il);
  3496. // input for next layer
  3497. inpL = cur;
  3498. } else {
  3499. // self attention layer
  3500. // rope freq factors for llama3; may return nullptr for llama2 and other models
  3501. struct ggml_tensor * rope_factors = build_rope_factors(il);
  3502. // compute Q and K and RoPE them
  3503. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  3504. cb(Qcur, "Qcur", il);
  3505. if (model.layers[il].bq) {
  3506. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  3507. cb(Qcur, "Qcur", il);
  3508. }
  3509. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  3510. cb(Kcur, "Kcur", il);
  3511. if (model.layers[il].bk) {
  3512. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  3513. cb(Kcur, "Kcur", il);
  3514. }
  3515. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  3516. cb(Vcur, "Vcur", il);
  3517. if (model.layers[il].bv) {
  3518. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  3519. cb(Vcur, "Vcur", il);
  3520. }
  3521. Qcur = ggml_rope_ext(
  3522. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
  3523. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3524. ext_factor, attn_factor, beta_fast, beta_slow
  3525. );
  3526. cb(Qcur, "Qcur", il);
  3527. Kcur = ggml_rope_ext(
  3528. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
  3529. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3530. ext_factor, attn_factor, beta_fast, beta_slow
  3531. );
  3532. cb(Kcur, "Kcur", il);
  3533. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  3534. model.layers[il].wo, model.layers[il].bo,
  3535. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  3536. if (il == n_layer - 1) {
  3537. // skip computing output for unused tokens
  3538. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  3539. n_tokens = n_outputs;
  3540. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3541. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  3542. }
  3543. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  3544. cb(ffn_inp, "ffn_inp", il);
  3545. // feed-forward network
  3546. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  3547. model.layers[il].ffn_norm, NULL,
  3548. LLM_NORM_RMS, cb, il);
  3549. cb(cur, "ffn_norm", il);
  3550. cur = llm_build_ffn(ctx0, lctx, cur,
  3551. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  3552. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  3553. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  3554. NULL,
  3555. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  3556. cb(cur, "ffn_out", il);
  3557. cur = ggml_add(ctx0, cur, ffn_inp);
  3558. cb(cur, "ffn_out", il);
  3559. cur = lctx.cvec.apply_to(ctx0, cur, il);
  3560. cb(cur, "l_out", il);
  3561. // input for next layer
  3562. inpL = cur;
  3563. }
  3564. }
  3565. cur = inpL;
  3566. cur = llm_build_norm(ctx0, cur, hparams,
  3567. model.output_norm, NULL,
  3568. LLM_NORM_RMS, cb, -1);
  3569. cb(cur, "result_norm", -1);
  3570. // lm_head
  3571. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  3572. cb(cur, "result_output", -1);
  3573. ggml_build_forward_expand(gf, cur);
  3574. return gf;
  3575. }
  3576. struct ggml_cgraph * build_deci() {
  3577. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  3578. // mutable variable, needed during the last layer of the computation to skip unused tokens
  3579. int32_t n_tokens = this->n_tokens;
  3580. const int64_t n_embd_head = hparams.n_embd_head_v;
  3581. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3582. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3583. struct ggml_tensor * cur;
  3584. struct ggml_tensor * inpL;
  3585. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  3586. // inp_pos - contains the positions
  3587. struct ggml_tensor * inp_pos = build_inp_pos();
  3588. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  3589. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  3590. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  3591. for (int il = 0; il < n_layer; ++il) {
  3592. struct ggml_tensor * inpSA = inpL;
  3593. const int64_t n_head_kv = hparams.n_head_kv(il);
  3594. const int64_t n_head = hparams.n_head(il);
  3595. if (n_head == 0) {
  3596. // attention-free layer of Llama-3_1-Nemotron-51B
  3597. cur = inpL;
  3598. } else {
  3599. // norm
  3600. cur = llm_build_norm(ctx0, inpL, hparams,
  3601. model.layers[il].attn_norm, NULL,
  3602. LLM_NORM_RMS, cb, il);
  3603. cb(cur, "attn_norm", il);
  3604. }
  3605. if (n_head > 0 && n_head_kv == 0) {
  3606. // "linear attention" of Llama-3_1-Nemotron-51B
  3607. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
  3608. cb(cur, "wo", il);
  3609. } else if (n_head > 0) {
  3610. // self-attention
  3611. // rope freq factors for llama3; may return nullptr for llama2 and other models
  3612. struct ggml_tensor * rope_factors = build_rope_factors(il);
  3613. // compute Q and K and RoPE them
  3614. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  3615. cb(Qcur, "Qcur", il);
  3616. if (model.layers[il].bq) {
  3617. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  3618. cb(Qcur, "Qcur", il);
  3619. }
  3620. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  3621. cb(Kcur, "Kcur", il);
  3622. if (model.layers[il].bk) {
  3623. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  3624. cb(Kcur, "Kcur", il);
  3625. }
  3626. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  3627. cb(Vcur, "Vcur", il);
  3628. if (model.layers[il].bv) {
  3629. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  3630. cb(Vcur, "Vcur", il);
  3631. }
  3632. Qcur = ggml_rope_ext(
  3633. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
  3634. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3635. ext_factor, attn_factor, beta_fast, beta_slow
  3636. );
  3637. cb(Qcur, "Qcur", il);
  3638. Kcur = ggml_rope_ext(
  3639. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
  3640. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3641. ext_factor, attn_factor, beta_fast, beta_slow
  3642. );
  3643. cb(Kcur, "Kcur", il);
  3644. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  3645. model.layers[il].wo, model.layers[il].bo,
  3646. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
  3647. }
  3648. if (il == n_layer - 1) {
  3649. // skip computing output for unused tokens
  3650. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  3651. n_tokens = n_outputs;
  3652. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3653. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  3654. }
  3655. // For Granite architecture
  3656. if (hparams.f_residual_scale) {
  3657. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  3658. }
  3659. // modified to support attention-free layer of Llama-3_1-Nemotron-51B
  3660. struct ggml_tensor * ffn_inp = cur;
  3661. if (n_head > 0) {
  3662. ffn_inp = ggml_add(ctx0, cur, inpSA);
  3663. cb(ffn_inp, "ffn_inp", il);
  3664. }
  3665. // feed-forward network
  3666. if (model.layers[il].ffn_gate_inp == nullptr) {
  3667. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  3668. model.layers[il].ffn_norm, NULL,
  3669. LLM_NORM_RMS, cb, il);
  3670. cb(cur, "ffn_norm", il);
  3671. cur = llm_build_ffn(ctx0, lctx, cur,
  3672. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  3673. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  3674. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  3675. NULL,
  3676. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  3677. cb(cur, "ffn_out", il);
  3678. }
  3679. // For Granite architecture
  3680. if (hparams.f_residual_scale) {
  3681. cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
  3682. }
  3683. cur = ggml_add(ctx0, cur, ffn_inp);
  3684. cb(cur, "ffn_out", il);
  3685. cur = lctx.cvec.apply_to(ctx0, cur, il);
  3686. cb(cur, "l_out", il);
  3687. // input for next layer
  3688. inpL = cur;
  3689. }
  3690. cur = inpL;
  3691. cur = llm_build_norm(ctx0, cur, hparams,
  3692. model.output_norm, NULL,
  3693. LLM_NORM_RMS, cb, -1);
  3694. cb(cur, "result_norm", -1);
  3695. // lm_head
  3696. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  3697. // For Granite architecture
  3698. if (hparams.f_logit_scale) {
  3699. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale);
  3700. }
  3701. cb(cur, "result_output", -1);
  3702. ggml_build_forward_expand(gf, cur);
  3703. return gf;
  3704. }
  3705. struct ggml_cgraph * build_baichuan() {
  3706. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  3707. const int64_t n_embd_head = hparams.n_embd_head_v;
  3708. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3709. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3710. struct ggml_tensor * cur;
  3711. struct ggml_tensor * inpL;
  3712. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  3713. // inp_pos - contains the positions
  3714. struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
  3715. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  3716. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  3717. for (int il = 0; il < n_layer; ++il) {
  3718. struct ggml_tensor * inpSA = inpL;
  3719. cur = llm_build_norm(ctx0, inpL, hparams,
  3720. model.layers[il].attn_norm, NULL,
  3721. LLM_NORM_RMS, cb, il);
  3722. cb(cur, "attn_norm", il);
  3723. // self-attention
  3724. {
  3725. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  3726. cb(Qcur, "Qcur", il);
  3727. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  3728. cb(Kcur, "Kcur", il);
  3729. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  3730. cb(Vcur, "Vcur", il);
  3731. switch (model.type) {
  3732. case MODEL_7B:
  3733. Qcur = ggml_rope_ext(
  3734. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  3735. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3736. ext_factor, attn_factor, beta_fast, beta_slow
  3737. );
  3738. Kcur = ggml_rope_ext(
  3739. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  3740. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3741. ext_factor, attn_factor, beta_fast, beta_slow
  3742. );
  3743. break;
  3744. case MODEL_13B:
  3745. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
  3746. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
  3747. break;
  3748. default:
  3749. GGML_ABORT("fatal error");
  3750. }
  3751. cb(Qcur, "Qcur", il);
  3752. cb(Kcur, "Kcur", il);
  3753. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  3754. model.layers[il].wo, NULL,
  3755. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  3756. }
  3757. if (il == n_layer - 1) {
  3758. // skip computing output for unused tokens
  3759. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  3760. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3761. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  3762. }
  3763. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  3764. cb(ffn_inp, "ffn_inp", il);
  3765. // feed-forward network
  3766. {
  3767. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  3768. model.layers[il].ffn_norm, NULL,
  3769. LLM_NORM_RMS, cb, il);
  3770. cb(cur, "ffn_norm", il);
  3771. cur = llm_build_ffn(ctx0, lctx, cur,
  3772. model.layers[il].ffn_up, NULL, NULL,
  3773. model.layers[il].ffn_gate, NULL, NULL,
  3774. model.layers[il].ffn_down, NULL, NULL,
  3775. NULL,
  3776. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  3777. cb(cur, "ffn_out", il);
  3778. }
  3779. cur = ggml_add(ctx0, cur, ffn_inp);
  3780. cur = lctx.cvec.apply_to(ctx0, cur, il);
  3781. cb(cur, "l_out", il);
  3782. // input for next layer
  3783. inpL = cur;
  3784. }
  3785. cur = inpL;
  3786. cur = llm_build_norm(ctx0, cur, hparams,
  3787. model.output_norm, NULL,
  3788. LLM_NORM_RMS, cb, -1);
  3789. cb(cur, "result_norm", -1);
  3790. // lm_head
  3791. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  3792. cb(cur, "result_output", -1);
  3793. ggml_build_forward_expand(gf, cur);
  3794. return gf;
  3795. }
  3796. struct ggml_cgraph * build_xverse() {
  3797. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  3798. const int64_t n_embd_head = hparams.n_embd_head_v;
  3799. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3800. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3801. struct ggml_tensor * cur;
  3802. struct ggml_tensor * inpL;
  3803. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  3804. // inp_pos - contains the positions
  3805. struct ggml_tensor * inp_pos = build_inp_pos();
  3806. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  3807. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  3808. for (int il = 0; il < n_layer; ++il) {
  3809. struct ggml_tensor * inpSA = inpL;
  3810. cur = llm_build_norm(ctx0, inpL, hparams,
  3811. model.layers[il].attn_norm, NULL,
  3812. LLM_NORM_RMS, cb, il);
  3813. cb(cur, "attn_norm", il);
  3814. // self-attention
  3815. {
  3816. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  3817. cb(Qcur, "Qcur", il);
  3818. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  3819. cb(Kcur, "Kcur", il);
  3820. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  3821. cb(Vcur, "Vcur", il);
  3822. Qcur = ggml_rope_ext(
  3823. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  3824. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3825. ext_factor, attn_factor, beta_fast, beta_slow
  3826. );
  3827. cb(Qcur, "Qcur", il);
  3828. Kcur = ggml_rope_ext(
  3829. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  3830. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  3831. ext_factor, attn_factor, beta_fast, beta_slow
  3832. );
  3833. cb(Kcur, "Kcur", il);
  3834. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  3835. model.layers[il].wo, NULL,
  3836. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  3837. }
  3838. if (il == n_layer - 1) {
  3839. // skip computing output for unused tokens
  3840. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  3841. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3842. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  3843. }
  3844. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  3845. cb(ffn_inp, "ffn_inp", il);
  3846. // feed-forward network
  3847. {
  3848. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  3849. model.layers[il].ffn_norm, NULL,
  3850. LLM_NORM_RMS, cb, il);
  3851. cb(cur, "ffn_norm", il);
  3852. cur = llm_build_ffn(ctx0, lctx, cur,
  3853. model.layers[il].ffn_up, NULL, NULL,
  3854. model.layers[il].ffn_gate, NULL, NULL,
  3855. model.layers[il].ffn_down, NULL, NULL,
  3856. NULL,
  3857. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  3858. cb(cur, "ffn_out", il);
  3859. }
  3860. cur = ggml_add(ctx0, cur, ffn_inp);
  3861. cur = lctx.cvec.apply_to(ctx0, cur, il);
  3862. cb(cur, "l_out", il);
  3863. // input for next layer
  3864. inpL = cur;
  3865. }
  3866. cur = inpL;
  3867. cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
  3868. cb(cur, "result_norm", -1);
  3869. // lm_head
  3870. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  3871. cb(cur, "result_output", -1);
  3872. ggml_build_forward_expand(gf, cur);
  3873. return gf;
  3874. }
  3875. struct ggml_cgraph * build_falcon() {
  3876. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  3877. const int64_t n_embd_head = hparams.n_embd_head_v;
  3878. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  3879. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3880. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3881. struct ggml_tensor * cur;
  3882. struct ggml_tensor * inpL;
  3883. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  3884. // inp_pos - contains the positions
  3885. struct ggml_tensor * inp_pos = build_inp_pos();
  3886. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  3887. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  3888. for (int il = 0; il < n_layer; ++il) {
  3889. struct ggml_tensor * attn_norm;
  3890. attn_norm = llm_build_norm(ctx0, inpL, hparams,
  3891. model.layers[il].attn_norm,
  3892. model.layers[il].attn_norm_b,
  3893. LLM_NORM, cb, il);
  3894. cb(attn_norm, "attn_norm", il);
  3895. // self-attention
  3896. {
  3897. if (model.layers[il].attn_norm_2) {
  3898. // Falcon-40B
  3899. cur = llm_build_norm(ctx0, inpL, hparams,
  3900. model.layers[il].attn_norm_2,
  3901. model.layers[il].attn_norm_2_b,
  3902. LLM_NORM, cb, il);
  3903. cb(cur, "attn_norm_2", il);
  3904. } else {
  3905. cur = attn_norm;
  3906. }
  3907. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  3908. cb(cur, "wqkv", il);
  3909. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  3910. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  3911. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  3912. cb(Qcur, "Qcur", il);
  3913. cb(Kcur, "Kcur", il);
  3914. cb(Vcur, "Vcur", il);
  3915. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  3916. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  3917. // using mode = 2 for neox mode
  3918. Qcur = ggml_rope_ext(
  3919. ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  3920. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  3921. );
  3922. cb(Qcur, "Qcur", il);
  3923. Kcur = ggml_rope_ext(
  3924. ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  3925. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  3926. );
  3927. cb(Kcur, "Kcur", il);
  3928. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  3929. model.layers[il].wo, NULL,
  3930. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  3931. }
  3932. if (il == n_layer - 1) {
  3933. // skip computing output for unused tokens
  3934. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  3935. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  3936. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  3937. attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
  3938. }
  3939. struct ggml_tensor * ffn_inp = cur;
  3940. // feed forward
  3941. {
  3942. cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result
  3943. model.layers[il].ffn_up, NULL, NULL,
  3944. NULL, NULL, NULL,
  3945. model.layers[il].ffn_down, NULL, NULL,
  3946. NULL,
  3947. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  3948. cb(cur, "ffn_out", il);
  3949. }
  3950. cur = ggml_add(ctx0, cur, ffn_inp);
  3951. cur = ggml_add(ctx0, cur, inpL);
  3952. cur = lctx.cvec.apply_to(ctx0, cur, il);
  3953. cb(cur, "l_out", il);
  3954. // input for next layer
  3955. inpL = cur;
  3956. }
  3957. cur = inpL;
  3958. // norm
  3959. cur = llm_build_norm(ctx0, cur, hparams,
  3960. model.output_norm,
  3961. model.output_norm_b,
  3962. LLM_NORM, cb, -1);
  3963. cb(cur, "result_norm", -1);
  3964. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  3965. cb(cur, "result_output", -1);
  3966. ggml_build_forward_expand(gf, cur);
  3967. return gf;
  3968. }
  3969. struct ggml_cgraph * build_grok() {
  3970. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  3971. // mutable variable, needed during the last layer of the computation to skip unused tokens
  3972. int32_t n_tokens = this->n_tokens;
  3973. const int64_t n_embd_head = hparams.n_embd_head_v;
  3974. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  3975. GGML_ASSERT(n_embd_head == hparams.n_rot);
  3976. struct ggml_tensor * cur;
  3977. struct ggml_tensor * inpL;
  3978. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  3979. // multiply by embedding_multiplier_scale of 78.38367176906169
  3980. inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
  3981. // inp_pos - contains the positions
  3982. struct ggml_tensor * inp_pos = build_inp_pos();
  3983. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  3984. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  3985. for (int il = 0; il < n_layer; ++il) {
  3986. struct ggml_tensor * inpSA = inpL;
  3987. // norm
  3988. cur = llm_build_norm(ctx0, inpL, hparams,
  3989. model.layers[il].attn_norm, NULL,
  3990. LLM_NORM_RMS, cb, il);
  3991. cb(cur, "attn_norm", il);
  3992. // self-attention
  3993. {
  3994. // compute Q and K and RoPE them
  3995. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  3996. cb(Qcur, "Qcur", il);
  3997. if (model.layers[il].bq) {
  3998. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  3999. cb(Qcur, "Qcur", il);
  4000. }
  4001. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  4002. cb(Kcur, "Kcur", il);
  4003. if (model.layers[il].bk) {
  4004. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4005. cb(Kcur, "Kcur", il);
  4006. }
  4007. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  4008. cb(Vcur, "Vcur", il);
  4009. if (model.layers[il].bv) {
  4010. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4011. cb(Vcur, "Vcur", il);
  4012. }
  4013. Qcur = ggml_rope_ext(
  4014. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  4015. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4016. ext_factor, attn_factor, beta_fast, beta_slow
  4017. );
  4018. cb(Qcur, "Qcur", il);
  4019. Kcur = ggml_rope_ext(
  4020. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  4021. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4022. ext_factor, attn_factor, beta_fast, beta_slow
  4023. );
  4024. cb(Kcur, "Kcur", il);
  4025. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  4026. model.layers[il].wo, model.layers[il].bo,
  4027. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  4028. }
  4029. if (il == n_layer - 1) {
  4030. // skip computing output for unused tokens
  4031. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  4032. n_tokens = n_outputs;
  4033. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4034. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4035. }
  4036. // Grok
  4037. // if attn_out_norm is present then apply it before adding the input
  4038. if (model.layers[il].attn_out_norm) {
  4039. cur = llm_build_norm(ctx0, cur, hparams,
  4040. model.layers[il].attn_out_norm, NULL,
  4041. LLM_NORM_RMS, cb, il);
  4042. cb(cur, "attn_out_norm", il);
  4043. }
  4044. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4045. cb(ffn_inp, "ffn_inp", il);
  4046. // feed-forward network
  4047. // MoE branch
  4048. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4049. model.layers[il].ffn_norm, NULL,
  4050. LLM_NORM_RMS, cb, il);
  4051. cb(cur, "ffn_norm", il);
  4052. cur = llm_build_moe_ffn(ctx0, lctx, cur,
  4053. model.layers[il].ffn_gate_inp,
  4054. model.layers[il].ffn_up_exps,
  4055. model.layers[il].ffn_gate_exps,
  4056. model.layers[il].ffn_down_exps,
  4057. nullptr,
  4058. n_expert, n_expert_used,
  4059. LLM_FFN_GELU, true,
  4060. false, 0.0,
  4061. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  4062. cb, il);
  4063. cb(cur, "ffn_moe_out", il);
  4064. // Grok
  4065. // if layer_out_norm is present then apply it before adding the input
  4066. // Idea: maybe ffn_out_norm is a better name
  4067. if (model.layers[il].layer_out_norm) {
  4068. cur = llm_build_norm(ctx0, cur, hparams,
  4069. model.layers[il].layer_out_norm, NULL,
  4070. LLM_NORM_RMS, cb, il);
  4071. cb(cur, "layer_out_norm", il);
  4072. }
  4073. cur = ggml_add(ctx0, cur, ffn_inp);
  4074. cb(cur, "ffn_out", il);
  4075. cur = lctx.cvec.apply_to(ctx0, cur, il);
  4076. cb(cur, "l_out", il);
  4077. // input for next layer
  4078. inpL = cur;
  4079. }
  4080. cur = inpL;
  4081. cur = llm_build_norm(ctx0, cur, hparams,
  4082. model.output_norm, NULL,
  4083. LLM_NORM_RMS, cb, -1);
  4084. cb(cur, "result_norm", -1);
  4085. // lm_head
  4086. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  4087. // Grok
  4088. // multiply logits by output_multiplier_scale of 0.5773502691896257
  4089. cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
  4090. cb(cur, "result_output", -1);
  4091. ggml_build_forward_expand(gf, cur);
  4092. return gf;
  4093. }
  4094. struct ggml_cgraph * build_dbrx() {
  4095. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  4096. // mutable variable, needed during the last layer of the computation to skip unused tokens
  4097. int32_t n_tokens = this->n_tokens;
  4098. const int64_t n_embd_head = hparams.n_embd_head_v;
  4099. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4100. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4101. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4102. struct ggml_tensor * cur;
  4103. struct ggml_tensor * inpL;
  4104. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  4105. // inp_pos - contains the positions
  4106. struct ggml_tensor * inp_pos = build_inp_pos();
  4107. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4108. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  4109. for (int il = 0; il < n_layer; ++il) {
  4110. struct ggml_tensor * inpSA = inpL;
  4111. // norm
  4112. cur = llm_build_norm(ctx0, inpL, hparams,
  4113. model.layers[il].attn_norm, NULL,
  4114. LLM_NORM, cb, il);
  4115. cb(cur, "attn_norm", il);
  4116. // self-attention
  4117. {
  4118. struct ggml_tensor * Qcur = nullptr;
  4119. struct ggml_tensor * Kcur = nullptr;
  4120. struct ggml_tensor * Vcur = nullptr;
  4121. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  4122. cb(cur, "wqkv", il);
  4123. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  4124. cb(cur, "wqkv_clamped", il);
  4125. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4126. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4127. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4128. cb(Qcur, "Qcur", il);
  4129. cb(Kcur, "Kcur", il);
  4130. cb(Vcur, "Vcur", il);
  4131. Qcur = ggml_rope_ext(
  4132. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  4133. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4134. ext_factor, attn_factor, beta_fast, beta_slow
  4135. );
  4136. cb(Qcur, "Qcur", il);
  4137. Kcur = ggml_rope_ext(
  4138. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  4139. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4140. ext_factor, attn_factor, beta_fast, beta_slow
  4141. );
  4142. cb(Kcur, "Kcur", il);
  4143. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  4144. model.layers[il].wo, NULL,
  4145. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4146. }
  4147. if (il == n_layer - 1) {
  4148. // skip computing output for unused tokens
  4149. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  4150. n_tokens = n_outputs;
  4151. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4152. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4153. }
  4154. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4155. cb(ffn_inp, "ffn_inp", il);
  4156. // feed-forward network
  4157. // MoE branch
  4158. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4159. model.layers[il].attn_out_norm, NULL,
  4160. LLM_NORM, cb, il);
  4161. cb(cur, "attn_out_norm", il);
  4162. cur = llm_build_moe_ffn(ctx0, lctx, cur,
  4163. model.layers[il].ffn_gate_inp,
  4164. model.layers[il].ffn_up_exps,
  4165. model.layers[il].ffn_gate_exps,
  4166. model.layers[il].ffn_down_exps,
  4167. nullptr,
  4168. n_expert, n_expert_used,
  4169. LLM_FFN_SILU, true,
  4170. false, 0.0,
  4171. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  4172. cb, il);
  4173. cb(cur, "ffn_moe_out", il);
  4174. cur = ggml_add(ctx0, cur, ffn_inp);
  4175. cb(cur, "ffn_out", il);
  4176. cur = lctx.cvec.apply_to(ctx0, cur, il);
  4177. cb(cur, "l_out", il);
  4178. // input for next layer
  4179. inpL = cur;
  4180. }
  4181. cur = inpL;
  4182. cur = llm_build_norm(ctx0, cur, hparams,
  4183. model.output_norm, NULL,
  4184. LLM_NORM, cb, -1);
  4185. cb(cur, "result_norm", -1);
  4186. // lm_head
  4187. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  4188. cb(cur, "result_output", -1);
  4189. ggml_build_forward_expand(gf, cur);
  4190. return gf;
  4191. }
  4192. struct ggml_cgraph * build_starcoder() {
  4193. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  4194. const int64_t n_embd_head = hparams.n_embd_head_v;
  4195. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4196. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4197. struct ggml_tensor * cur;
  4198. struct ggml_tensor * inpL;
  4199. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  4200. // inp_pos - contains the positions
  4201. struct ggml_tensor * inp_pos = build_inp_pos();
  4202. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4203. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  4204. struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  4205. cb(pos, "pos_embd", -1);
  4206. inpL = ggml_add(ctx0, inpL, pos);
  4207. cb(inpL, "inpL", -1);
  4208. for (int il = 0; il < n_layer; ++il) {
  4209. cur = llm_build_norm(ctx0, inpL, hparams,
  4210. model.layers[il].attn_norm,
  4211. model.layers[il].attn_norm_b,
  4212. LLM_NORM, cb, il);
  4213. cb(cur, "attn_norm", il);
  4214. // self-attention
  4215. {
  4216. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  4217. cb(cur, "wqkv", il);
  4218. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4219. cb(cur, "bqkv", il);
  4220. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4221. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4222. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4223. cb(Qcur, "Qcur", il);
  4224. cb(Kcur, "Kcur", il);
  4225. cb(Vcur, "Vcur", il);
  4226. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4227. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  4228. model.layers[il].wo, model.layers[il].bo,
  4229. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4230. }
  4231. if (il == n_layer - 1) {
  4232. // skip computing output for unused tokens
  4233. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  4234. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4235. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4236. }
  4237. // add the input
  4238. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4239. cb(ffn_inp, "ffn_inp", il);
  4240. // FF
  4241. {
  4242. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4243. model.layers[il].ffn_norm,
  4244. model.layers[il].ffn_norm_b,
  4245. LLM_NORM, cb, il);
  4246. cb(cur, "ffn_norm", il);
  4247. cur = llm_build_ffn(ctx0, lctx, cur,
  4248. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4249. NULL, NULL, NULL,
  4250. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4251. NULL,
  4252. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  4253. cb(cur, "ffn_out", il);
  4254. }
  4255. cur = ggml_add(ctx0, cur, ffn_inp);
  4256. cur = lctx.cvec.apply_to(ctx0, cur, il);
  4257. cb(cur, "l_out", il);
  4258. // input for next layer
  4259. inpL = cur;
  4260. }
  4261. cur = llm_build_norm(ctx0, inpL, hparams,
  4262. model.output_norm,
  4263. model.output_norm_b,
  4264. LLM_NORM, cb, -1);
  4265. cb(cur, "result_norm", -1);
  4266. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  4267. cb(cur, "result_output", -1);
  4268. ggml_build_forward_expand(gf, cur);
  4269. return gf;
  4270. }
  4271. struct ggml_cgraph * build_refact() {
  4272. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  4273. const int64_t n_embd_head = hparams.n_embd_head_v;
  4274. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4275. struct ggml_tensor * cur;
  4276. struct ggml_tensor * inpL;
  4277. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  4278. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4279. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  4280. for (int il = 0; il < n_layer; ++il) {
  4281. struct ggml_tensor * inpSA = inpL;
  4282. cur = llm_build_norm(ctx0, inpL, hparams,
  4283. model.layers[il].attn_norm, NULL,
  4284. LLM_NORM_RMS, cb, il);
  4285. cb(cur, "attn_norm", il);
  4286. // self-attention
  4287. {
  4288. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  4289. cb(Qcur, "Qcur", il);
  4290. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  4291. cb(Kcur, "Kcur", il);
  4292. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  4293. cb(Vcur, "Vcur", il);
  4294. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4295. cb(Kcur, "Kcur", il);
  4296. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4297. cb(Qcur, "Qcur", il);
  4298. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  4299. model.layers[il].wo, NULL,
  4300. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4301. }
  4302. if (il == n_layer - 1) {
  4303. // skip computing output for unused tokens
  4304. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  4305. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4306. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4307. }
  4308. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4309. cb(ffn_inp, "ffn_inp", il);
  4310. // feed-forward network
  4311. {
  4312. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4313. model.layers[il].ffn_norm, NULL,
  4314. LLM_NORM_RMS, cb, il);
  4315. cb(cur, "ffn_norm", il);
  4316. cur = llm_build_ffn(ctx0, lctx, cur,
  4317. model.layers[il].ffn_up, NULL, NULL,
  4318. model.layers[il].ffn_gate, NULL, NULL,
  4319. model.layers[il].ffn_down, NULL, NULL,
  4320. NULL,
  4321. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  4322. cb(cur, "ffn_out", il);
  4323. }
  4324. cur = ggml_add(ctx0, cur, ffn_inp);
  4325. cur = lctx.cvec.apply_to(ctx0, cur, il);
  4326. cb(cur, "l_out", il);
  4327. // input for next layer
  4328. inpL = cur;
  4329. }
  4330. cur = inpL;
  4331. cur = llm_build_norm(ctx0, cur, hparams,
  4332. model.output_norm, NULL,
  4333. LLM_NORM_RMS, cb, -1);
  4334. cb(cur, "result_norm", -1);
  4335. // lm_head
  4336. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  4337. cb(cur, "result_output", -1);
  4338. ggml_build_forward_expand(gf, cur);
  4339. return gf;
  4340. }
  4341. struct ggml_cgraph * build_bert() {
  4342. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  4343. const int64_t n_embd_head = hparams.n_embd_head_v;
  4344. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4345. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4346. struct ggml_tensor * cur;
  4347. struct ggml_tensor * inpL;
  4348. struct ggml_tensor * inp_pos = nullptr;
  4349. if (model.arch != LLM_ARCH_JINA_BERT_V2) {
  4350. inp_pos = build_inp_pos();
  4351. }
  4352. // construct input embeddings (token, type, position)
  4353. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  4354. // token types are hardcoded to zero ("Sentence A")
  4355. struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
  4356. inpL = ggml_add(ctx0, inpL, type_row0);
  4357. if (model.arch == LLM_ARCH_BERT) {
  4358. inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
  4359. }
  4360. cb(inpL, "inp_embd", -1);
  4361. // embed layer norm
  4362. inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
  4363. cb(inpL, "inp_norm", -1);
  4364. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4365. struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
  4366. // iterate layers
  4367. for (int il = 0; il < n_layer; ++il) {
  4368. struct ggml_tensor * cur = inpL;
  4369. struct ggml_tensor * Qcur;
  4370. struct ggml_tensor * Kcur;
  4371. struct ggml_tensor * Vcur;
  4372. // self-attention
  4373. if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
  4374. Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq);
  4375. cb(Qcur, "Qcur", il);
  4376. if (model.layers[il].attn_q_norm) {
  4377. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  4378. model.layers[il].attn_q_norm,
  4379. model.layers[il].attn_q_norm_b,
  4380. LLM_NORM, cb, il);
  4381. }
  4382. Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk);
  4383. cb(Kcur, "Kcur", il);
  4384. if (model.layers[il].attn_k_norm) {
  4385. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  4386. model.layers[il].attn_k_norm,
  4387. model.layers[il].attn_k_norm_b,
  4388. LLM_NORM, cb, il);
  4389. }
  4390. Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv);
  4391. cb(Vcur, "Vcur", il);
  4392. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4393. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4394. } else {
  4395. // compute Q and K and RoPE them
  4396. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  4397. cb(cur, "wqkv", il);
  4398. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4399. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4400. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4401. cb(Qcur, "Qcur", il);
  4402. cb(Kcur, "Kcur", il);
  4403. cb(Vcur, "Vcur", il);
  4404. Qcur = ggml_rope_ext(
  4405. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  4406. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4407. ext_factor, attn_factor, beta_fast, beta_slow
  4408. );
  4409. cb(Qcur, "Qcur", il);
  4410. Kcur = ggml_rope_ext(
  4411. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  4412. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4413. ext_factor, attn_factor, beta_fast, beta_slow
  4414. );
  4415. cb(Kcur, "Kcur", il);
  4416. }
  4417. struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  4418. struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  4419. struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  4420. cb(kq, "kq", il);
  4421. kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
  4422. cb(kq, "kq_soft_max_ext", il);
  4423. struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
  4424. cb(v, "v", il);
  4425. struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
  4426. cb(kqv, "kqv", il);
  4427. struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  4428. cb(kqv_merged, "kqv_merged", il);
  4429. cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  4430. cb(cur, "kqv_merged_cont", il);
  4431. ggml_build_forward_expand(gf, cur);
  4432. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
  4433. if (model.layers[il].bo) {
  4434. cb(cur, "kqv_wo", il);
  4435. }
  4436. if (model.layers[il].bo) {
  4437. cur = ggml_add(ctx0, cur, model.layers[il].bo);
  4438. }
  4439. cb(cur, "kqv_out", il);
  4440. if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
  4441. // skip computing output for unused tokens
  4442. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  4443. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4444. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4445. }
  4446. // re-add the layer input
  4447. cur = ggml_add(ctx0, cur, inpL);
  4448. // attention layer norm
  4449. cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
  4450. if (model.layers[il].attn_norm_2 != nullptr) {
  4451. cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
  4452. cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
  4453. }
  4454. struct ggml_tensor * ffn_inp = cur;
  4455. cb(ffn_inp, "ffn_inp", il);
  4456. // feed-forward network
  4457. if (model.arch == LLM_ARCH_BERT) {
  4458. cur = llm_build_ffn(ctx0, lctx, cur,
  4459. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4460. NULL, NULL, NULL,
  4461. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4462. NULL,
  4463. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  4464. } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
  4465. cur = llm_build_ffn(ctx0, lctx, cur,
  4466. model.layers[il].ffn_up, NULL, NULL,
  4467. model.layers[il].ffn_gate, NULL, NULL,
  4468. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4469. NULL,
  4470. LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
  4471. } else {
  4472. cur = llm_build_ffn(ctx0, lctx, cur,
  4473. model.layers[il].ffn_up, NULL, NULL,
  4474. model.layers[il].ffn_gate, NULL, NULL,
  4475. model.layers[il].ffn_down, NULL, NULL,
  4476. NULL,
  4477. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  4478. }
  4479. cb(cur, "ffn_out", il);
  4480. // attentions bypass the intermediate layer
  4481. cur = ggml_add(ctx0, cur, ffn_inp);
  4482. // output layer norm
  4483. cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
  4484. // input for next layer
  4485. inpL = cur;
  4486. }
  4487. cur = inpL;
  4488. cb(cur, "result_embd", -1);
  4489. ggml_build_forward_expand(gf, cur);
  4490. return gf;
  4491. }
  4492. struct ggml_cgraph * build_bloom() {
  4493. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  4494. const int64_t n_embd_head = hparams.n_embd_head_v;
  4495. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4496. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4497. struct ggml_tensor * cur;
  4498. struct ggml_tensor * inpL;
  4499. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  4500. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4501. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  4502. inpL = llm_build_norm(ctx0, inpL, hparams,
  4503. model.tok_norm,
  4504. model.tok_norm_b,
  4505. LLM_NORM, cb, -1);
  4506. cb(inpL, "inp_norm", -1);
  4507. for (int il = 0; il < n_layer; ++il) {
  4508. cur = llm_build_norm(ctx0, inpL, hparams,
  4509. model.layers[il].attn_norm,
  4510. model.layers[il].attn_norm_b,
  4511. LLM_NORM, cb, il);
  4512. cb(cur, "attn_norm", il);
  4513. // self-attention
  4514. {
  4515. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  4516. cb(cur, "wqkv", il);
  4517. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4518. cb(cur, "bqkv", il);
  4519. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4520. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4521. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4522. cb(Qcur, "Qcur", il);
  4523. cb(Kcur, "Kcur", il);
  4524. cb(Vcur, "Vcur", il);
  4525. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4526. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  4527. model.layers[il].wo, model.layers[il].bo,
  4528. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4529. }
  4530. if (il == n_layer - 1) {
  4531. // skip computing output for unused tokens
  4532. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  4533. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4534. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4535. }
  4536. // Add the input
  4537. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4538. cb(ffn_inp, "ffn_inp", il);
  4539. // FF
  4540. {
  4541. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4542. model.layers[il].ffn_norm,
  4543. model.layers[il].ffn_norm_b,
  4544. LLM_NORM, cb, il);
  4545. cb(cur, "ffn_norm", il);
  4546. cur = llm_build_ffn(ctx0, lctx, cur,
  4547. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4548. NULL, NULL, NULL,
  4549. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4550. NULL,
  4551. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  4552. cb(cur, "ffn_out", il);
  4553. }
  4554. cur = ggml_add(ctx0, cur, ffn_inp);
  4555. cur = lctx.cvec.apply_to(ctx0, cur, il);
  4556. cb(cur, "l_out", il);
  4557. // input for next layer
  4558. inpL = cur;
  4559. }
  4560. cur = llm_build_norm(ctx0, inpL, hparams,
  4561. model.output_norm,
  4562. model.output_norm_b,
  4563. LLM_NORM, cb, -1);
  4564. cb(cur, "result_norm", -1);
  4565. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  4566. cb(cur, "result_output", -1);
  4567. ggml_build_forward_expand(gf, cur);
  4568. return gf;
  4569. }
  4570. struct ggml_cgraph * build_mpt() {
  4571. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  4572. const int64_t n_embd_head = hparams.n_embd_head_v;
  4573. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  4574. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4575. struct ggml_tensor * cur;
  4576. struct ggml_tensor * pos;
  4577. struct ggml_tensor * inpL;
  4578. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  4579. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4580. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  4581. if (model.pos_embd) {
  4582. // inp_pos - contains the positions
  4583. struct ggml_tensor * inp_pos = build_inp_pos();
  4584. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  4585. cb(pos, "pos_embd", -1);
  4586. inpL = ggml_add(ctx0, inpL, pos);
  4587. cb(inpL, "inpL", -1);
  4588. }
  4589. for (int il = 0; il < n_layer; ++il) {
  4590. struct ggml_tensor * attn_norm;
  4591. attn_norm = llm_build_norm(ctx0, inpL, hparams,
  4592. model.layers[il].attn_norm,
  4593. model.layers[il].attn_norm_b,
  4594. LLM_NORM, cb, il);
  4595. cb(attn_norm, "attn_norm", il);
  4596. // self-attention
  4597. {
  4598. cur = attn_norm;
  4599. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  4600. cb(cur, "wqkv", il);
  4601. if (model.layers[il].bqkv){
  4602. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4603. cb(cur, "bqkv", il);
  4604. }
  4605. if (hparams.f_clamp_kqv > 0.0f) {
  4606. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  4607. cb(cur, "wqkv_clamped", il);
  4608. }
  4609. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4610. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4611. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  4612. cb(Qcur, "Qcur", il);
  4613. cb(Kcur, "Kcur", il);
  4614. cb(Vcur, "Vcur", il);
  4615. // Q/K Layernorm
  4616. if (model.layers[il].attn_q_norm) {
  4617. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  4618. model.layers[il].attn_q_norm,
  4619. model.layers[il].attn_q_norm_b,
  4620. LLM_NORM, cb, il);
  4621. cb(Qcur, "Qcur", il);
  4622. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  4623. model.layers[il].attn_k_norm,
  4624. model.layers[il].attn_k_norm_b,
  4625. LLM_NORM, cb, il);
  4626. cb(Kcur, "Kcur", il);
  4627. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4628. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4629. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  4630. model.layers[il].wo, model.layers[il].bo,
  4631. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4632. } else {
  4633. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4634. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  4635. model.layers[il].wo, model.layers[il].bo,
  4636. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4637. }
  4638. }
  4639. if (il == n_layer - 1) {
  4640. // skip computing output for unused tokens
  4641. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  4642. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4643. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4644. }
  4645. // Add the input
  4646. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4647. cb(ffn_inp, "ffn_inp", il);
  4648. // feed forward
  4649. {
  4650. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4651. model.layers[il].ffn_norm,
  4652. model.layers[il].ffn_norm_b,
  4653. LLM_NORM, cb, il);
  4654. cb(cur, "ffn_norm", il);
  4655. cur = llm_build_ffn(ctx0, lctx, cur,
  4656. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  4657. NULL, NULL, NULL,
  4658. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  4659. model.layers[il].ffn_act,
  4660. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  4661. cb(cur, "ffn_out", il);
  4662. }
  4663. cur = ggml_add(ctx0, cur, ffn_inp);
  4664. cur = lctx.cvec.apply_to(ctx0, cur, il);
  4665. cb(cur, "l_out", il);
  4666. // input for next layer
  4667. inpL = cur;
  4668. }
  4669. cur = inpL;
  4670. cur = llm_build_norm(ctx0, cur, hparams,
  4671. model.output_norm,
  4672. model.output_norm_b,
  4673. LLM_NORM, cb, -1);
  4674. cb(cur, "result_norm", -1);
  4675. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  4676. cb(cur, "result_output", -1);
  4677. ggml_build_forward_expand(gf, cur);
  4678. return gf;
  4679. }
  4680. struct ggml_cgraph * build_stablelm() {
  4681. struct ggml_cgraph * gf = ggml_new_graph(ctx0);
  4682. const int64_t n_embd_head = hparams.n_embd_head_v;
  4683. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4684. struct ggml_tensor * cur;
  4685. struct ggml_tensor * inpL;
  4686. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  4687. // inp_pos - contains the positions
  4688. struct ggml_tensor * inp_pos = build_inp_pos();
  4689. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4690. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  4691. for (int il = 0; il < n_layer; ++il) {
  4692. // norm
  4693. cur = llm_build_norm(ctx0, inpL, hparams,
  4694. model.layers[il].attn_norm,
  4695. model.layers[il].attn_norm_b,
  4696. LLM_NORM, cb, il);
  4697. cb(cur, "attn_norm", il);
  4698. struct ggml_tensor * inpSA = cur;
  4699. // self-attention
  4700. {
  4701. // compute Q and K and RoPE them
  4702. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  4703. cb(Qcur, "Qcur", il);
  4704. if (model.layers[il].bq) {
  4705. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  4706. cb(Qcur, "Qcur", il);
  4707. }
  4708. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  4709. cb(Kcur, "Kcur", il);
  4710. if (model.layers[il].bk) {
  4711. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4712. cb(Kcur, "Kcur", il);
  4713. }
  4714. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  4715. cb(Vcur, "Vcur", il);
  4716. if (model.layers[il].bv) {
  4717. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4718. cb(Vcur, "Vcur", il);
  4719. }
  4720. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4721. cb(Qcur, "Qcur", il);
  4722. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4723. cb(Kcur, "Kcur", il);
  4724. if (model.layers[il].attn_q_norm) {
  4725. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  4726. model.layers[il].attn_q_norm,
  4727. NULL,
  4728. LLM_NORM, cb, il);
  4729. cb(Qcur, "Qcur", il);
  4730. }
  4731. if (model.layers[il].attn_k_norm) {
  4732. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  4733. model.layers[il].attn_k_norm,
  4734. NULL,
  4735. LLM_NORM, cb, il);
  4736. cb(Kcur, "Kcur", il);
  4737. }
  4738. Qcur = ggml_rope_ext(
  4739. ctx0, Qcur, inp_pos, nullptr,
  4740. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4741. ext_factor, attn_factor, beta_fast, beta_slow
  4742. );
  4743. cb(Qcur, "Qcur", il);
  4744. Kcur = ggml_rope_ext(
  4745. ctx0, Kcur, inp_pos, nullptr,
  4746. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4747. ext_factor, attn_factor, beta_fast, beta_slow
  4748. );
  4749. cb(Kcur, "Kcur", il);
  4750. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  4751. model.layers[il].wo, NULL,
  4752. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4753. }
  4754. if (il == n_layer - 1) {
  4755. // skip computing output for unused tokens
  4756. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  4757. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4758. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  4759. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4760. }
  4761. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  4762. cb(ffn_inp, "ffn_inp", il);
  4763. // feed-forward network
  4764. {
  4765. if (model.layers[il].ffn_norm) {
  4766. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4767. model.layers[il].ffn_norm,
  4768. model.layers[il].ffn_norm_b,
  4769. LLM_NORM, cb, il);
  4770. cb(cur, "ffn_norm", il);
  4771. } else {
  4772. // parallel residual
  4773. cur = inpSA;
  4774. }
  4775. cur = llm_build_ffn(ctx0, lctx, cur,
  4776. model.layers[il].ffn_up, NULL, NULL,
  4777. model.layers[il].ffn_gate, NULL, NULL,
  4778. model.layers[il].ffn_down, NULL, NULL,
  4779. NULL,
  4780. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  4781. cb(cur, "ffn_out", il);
  4782. }
  4783. cur = ggml_add(ctx0, cur, ffn_inp);
  4784. cur = lctx.cvec.apply_to(ctx0, cur, il);
  4785. cb(cur, "l_out", il);
  4786. // input for next layer
  4787. inpL = cur;
  4788. }
  4789. cur = inpL;
  4790. cur = llm_build_norm(ctx0, cur, hparams,
  4791. model.output_norm,
  4792. model.output_norm_b,
  4793. LLM_NORM, cb, -1);
  4794. cb(cur, "result_norm", -1);
  4795. // lm_head
  4796. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  4797. cb(cur, "result_output", -1);
  4798. ggml_build_forward_expand(gf, cur);
  4799. return gf;
  4800. }
  4801. struct ggml_cgraph * build_qwen() {
  4802. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  4803. const int64_t n_embd_head = hparams.n_embd_head_v;
  4804. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4805. struct ggml_tensor * cur;
  4806. struct ggml_tensor * inpL;
  4807. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  4808. // inp_pos - contains the positions
  4809. struct ggml_tensor * inp_pos = build_inp_pos();
  4810. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4811. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  4812. for (int il = 0; il < n_layer; ++il) {
  4813. struct ggml_tensor * inpSA = inpL;
  4814. cur = llm_build_norm(ctx0, inpL, hparams,
  4815. model.layers[il].attn_norm, NULL,
  4816. LLM_NORM_RMS, cb, il);
  4817. cb(cur, "attn_norm", il);
  4818. // self-attention
  4819. {
  4820. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  4821. cb(cur, "wqkv", il);
  4822. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  4823. cb(cur, "bqkv", il);
  4824. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  4825. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  4826. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
  4827. cb(Qcur, "Qcur", il);
  4828. cb(Kcur, "Kcur", il);
  4829. cb(Vcur, "Vcur", il);
  4830. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  4831. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  4832. // using mode = 2 for neox mode
  4833. Qcur = ggml_rope_ext(
  4834. ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  4835. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  4836. );
  4837. cb(Qcur, "Qcur", il);
  4838. Kcur = ggml_rope_ext(
  4839. ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  4840. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  4841. );
  4842. cb(Kcur, "Kcur", il);
  4843. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  4844. model.layers[il].wo, NULL,
  4845. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4846. }
  4847. if (il == n_layer - 1) {
  4848. // skip computing output for unused tokens
  4849. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  4850. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4851. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4852. }
  4853. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4854. cb(ffn_inp, "ffn_inp", il);
  4855. // feed-forward forward
  4856. {
  4857. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4858. model.layers[il].ffn_norm, NULL,
  4859. LLM_NORM_RMS, cb, il);
  4860. cb(cur, "ffn_norm", il);
  4861. cur = llm_build_ffn(ctx0, lctx, cur,
  4862. model.layers[il].ffn_up, NULL, NULL,
  4863. model.layers[il].ffn_gate, NULL, NULL,
  4864. model.layers[il].ffn_down, NULL, NULL,
  4865. NULL,
  4866. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  4867. cb(cur, "ffn_out", il);
  4868. }
  4869. cur = ggml_add(ctx0, cur, ffn_inp);
  4870. cur = lctx.cvec.apply_to(ctx0, cur, il);
  4871. cb(cur, "l_out", il);
  4872. // input for next layer
  4873. inpL = cur;
  4874. }
  4875. cur = inpL;
  4876. cur = llm_build_norm(ctx0, cur, hparams,
  4877. model.output_norm, NULL,
  4878. LLM_NORM_RMS, cb, -1);
  4879. cb(cur, "result_norm", -1);
  4880. // lm_head
  4881. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  4882. cb(cur, "result_output", -1);
  4883. ggml_build_forward_expand(gf, cur);
  4884. return gf;
  4885. }
  4886. struct ggml_cgraph * build_qwen2() {
  4887. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  4888. const int64_t n_embd_head = hparams.n_embd_head_v;
  4889. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4890. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4891. struct ggml_tensor * cur;
  4892. struct ggml_tensor * inpL;
  4893. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  4894. // inp_pos - contains the positions
  4895. struct ggml_tensor * inp_pos = build_inp_pos();
  4896. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4897. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  4898. for (int il = 0; il < n_layer; ++il) {
  4899. struct ggml_tensor * inpSA = inpL;
  4900. // norm
  4901. cur = llm_build_norm(ctx0, inpL, hparams,
  4902. model.layers[il].attn_norm, NULL,
  4903. LLM_NORM_RMS, cb, il);
  4904. cb(cur, "attn_norm", il);
  4905. // self-attention
  4906. {
  4907. // compute Q and K and RoPE them
  4908. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  4909. cb(Qcur, "Qcur", il);
  4910. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  4911. cb(Qcur, "Qcur", il);
  4912. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  4913. cb(Kcur, "Kcur", il);
  4914. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  4915. cb(Kcur, "Kcur", il);
  4916. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  4917. cb(Vcur, "Vcur", il);
  4918. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  4919. cb(Vcur, "Vcur", il);
  4920. Qcur = ggml_rope_ext(
  4921. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  4922. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4923. ext_factor, attn_factor, beta_fast, beta_slow
  4924. );
  4925. cb(Qcur, "Qcur", il);
  4926. Kcur = ggml_rope_ext(
  4927. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  4928. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  4929. ext_factor, attn_factor, beta_fast, beta_slow
  4930. );
  4931. cb(Kcur, "Kcur", il);
  4932. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  4933. model.layers[il].wo, model.layers[il].bo,
  4934. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  4935. }
  4936. if (il == n_layer - 1) {
  4937. // skip computing output for unused tokens
  4938. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  4939. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  4940. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  4941. }
  4942. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  4943. cb(ffn_inp, "ffn_inp", il);
  4944. // feed-forward network
  4945. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  4946. model.layers[il].ffn_norm, NULL,
  4947. LLM_NORM_RMS, cb, il);
  4948. cb(cur, "ffn_norm", il);
  4949. cur = llm_build_ffn(ctx0, lctx, cur,
  4950. model.layers[il].ffn_up, NULL, NULL,
  4951. model.layers[il].ffn_gate, NULL, NULL,
  4952. model.layers[il].ffn_down, NULL, NULL,
  4953. NULL,
  4954. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  4955. cb(cur, "ffn_out", il);
  4956. cur = ggml_add(ctx0, cur, ffn_inp);
  4957. cur = lctx.cvec.apply_to(ctx0, cur, il);
  4958. cb(cur, "l_out", il);
  4959. // input for next layer
  4960. inpL = cur;
  4961. }
  4962. cur = inpL;
  4963. cur = llm_build_norm(ctx0, cur, hparams,
  4964. model.output_norm, NULL,
  4965. LLM_NORM_RMS, cb, -1);
  4966. cb(cur, "result_norm", -1);
  4967. // lm_head
  4968. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  4969. cb(cur, "result_output", -1);
  4970. ggml_build_forward_expand(gf, cur);
  4971. return gf;
  4972. }
  4973. struct ggml_cgraph * build_qwen2vl() {
  4974. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  4975. const int64_t n_embd_head = hparams.n_embd_head_v;
  4976. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  4977. GGML_ASSERT(n_embd_head == hparams.n_rot);
  4978. struct ggml_tensor * cur;
  4979. struct ggml_tensor * inpL;
  4980. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  4981. // inp_pos - contains the positions
  4982. lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
  4983. cb(lctx.inp_pos, "inp_pos", -1);
  4984. ggml_set_input(lctx.inp_pos);
  4985. struct ggml_tensor * inp_pos = lctx.inp_pos;
  4986. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  4987. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  4988. int sections[4];
  4989. std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
  4990. for (int il = 0; il < n_layer; ++il) {
  4991. struct ggml_tensor * inpSA = inpL;
  4992. // norm
  4993. cur = llm_build_norm(ctx0, inpL, hparams,
  4994. model.layers[il].attn_norm, NULL,
  4995. LLM_NORM_RMS, cb, il);
  4996. cb(cur, "attn_norm", il);
  4997. // self-attention
  4998. {
  4999. // compute Q and K and RoPE them
  5000. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  5001. cb(Qcur, "Qcur", il);
  5002. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5003. cb(Qcur, "Qcur", il);
  5004. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  5005. cb(Kcur, "Kcur", il);
  5006. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5007. cb(Kcur, "Kcur", il);
  5008. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  5009. cb(Vcur, "Vcur", il);
  5010. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5011. cb(Vcur, "Vcur", il);
  5012. Qcur = ggml_rope_multi(
  5013. ctx0,
  5014. ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  5015. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  5016. ext_factor, attn_factor, beta_fast, beta_slow
  5017. );
  5018. cb(Qcur, "Qcur", il);
  5019. Kcur = ggml_rope_multi(
  5020. ctx0,
  5021. ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  5022. n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
  5023. ext_factor, attn_factor, beta_fast, beta_slow
  5024. );
  5025. cb(Kcur, "Kcur", il);
  5026. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  5027. model.layers[il].wo, model.layers[il].bo,
  5028. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  5029. }
  5030. if (il == n_layer - 1) {
  5031. // skip computing output for unused tokens
  5032. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  5033. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5034. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5035. }
  5036. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5037. cb(ffn_inp, "ffn_inp", il);
  5038. // feed-forward network
  5039. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  5040. model.layers[il].ffn_norm, NULL,
  5041. LLM_NORM_RMS, cb, il);
  5042. cb(cur, "ffn_norm", il);
  5043. cur = llm_build_ffn(ctx0, lctx, cur,
  5044. model.layers[il].ffn_up, NULL, NULL,
  5045. model.layers[il].ffn_gate, NULL, NULL,
  5046. model.layers[il].ffn_down, NULL, NULL,
  5047. NULL,
  5048. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  5049. cb(cur, "ffn_out", il);
  5050. cur = ggml_add(ctx0, cur, ffn_inp);
  5051. cur = lctx.cvec.apply_to(ctx0, cur, il);
  5052. cb(cur, "l_out", il);
  5053. // input for next layer
  5054. inpL = cur;
  5055. }
  5056. cur = inpL;
  5057. cur = llm_build_norm(ctx0, cur, hparams,
  5058. model.output_norm, NULL,
  5059. LLM_NORM_RMS, cb, -1);
  5060. cb(cur, "result_norm", -1);
  5061. // lm_head
  5062. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  5063. cb(cur, "result_output", -1);
  5064. ggml_build_forward_expand(gf, cur);
  5065. return gf;
  5066. }
  5067. struct ggml_cgraph * build_qwen2moe() {
  5068. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  5069. // mutable variable, needed during the last layer of the computation to skip unused tokens
  5070. int32_t n_tokens = this->n_tokens;
  5071. const int64_t n_embd_head = hparams.n_embd_head_v;
  5072. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5073. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5074. struct ggml_tensor * cur;
  5075. struct ggml_tensor * inpL;
  5076. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  5077. // inp_pos - contains the positions
  5078. struct ggml_tensor * inp_pos = build_inp_pos();
  5079. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  5080. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  5081. for (int il = 0; il < n_layer; ++il) {
  5082. struct ggml_tensor * inpSA = inpL;
  5083. // norm
  5084. cur = llm_build_norm(ctx0, inpL, hparams,
  5085. model.layers[il].attn_norm, NULL,
  5086. LLM_NORM_RMS, cb, il);
  5087. cb(cur, "attn_norm", il);
  5088. // self_attention
  5089. {
  5090. // compute Q and K and RoPE them
  5091. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  5092. cb(Qcur, "Qcur", il);
  5093. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5094. cb(Qcur, "Qcur", il);
  5095. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  5096. cb(Kcur, "Kcur", il);
  5097. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5098. cb(Kcur, "Kcur", il);
  5099. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  5100. cb(Vcur, "Vcur", il);
  5101. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5102. cb(Vcur, "Vcur", il);
  5103. Qcur = ggml_rope_ext(
  5104. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  5105. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5106. ext_factor, attn_factor, beta_fast, beta_slow
  5107. );
  5108. cb(Qcur, "Qcur", il);
  5109. Kcur = ggml_rope_ext(
  5110. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  5111. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5112. ext_factor, attn_factor, beta_fast, beta_slow
  5113. );
  5114. cb(Kcur, "Kcur", il);
  5115. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  5116. model.layers[il].wo, model.layers[il].bo,
  5117. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  5118. }
  5119. if (il == n_layer - 1) {
  5120. // skip computing output for unused tokens
  5121. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  5122. n_tokens = n_outputs;
  5123. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5124. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5125. }
  5126. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5127. cb(ffn_inp, "ffn_inp", il);
  5128. // MoE branch
  5129. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  5130. model.layers[il].ffn_norm, NULL,
  5131. LLM_NORM_RMS, cb, il);
  5132. cb(cur, "ffn_norm", il);
  5133. ggml_tensor * moe_out =
  5134. llm_build_moe_ffn(ctx0, lctx, cur,
  5135. model.layers[il].ffn_gate_inp,
  5136. model.layers[il].ffn_up_exps,
  5137. model.layers[il].ffn_gate_exps,
  5138. model.layers[il].ffn_down_exps,
  5139. nullptr,
  5140. n_expert, n_expert_used,
  5141. LLM_FFN_SILU, false,
  5142. false, 0.0,
  5143. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  5144. cb, il);
  5145. cb(cur, "ffn_moe_out", il);
  5146. // FFN shared expert
  5147. {
  5148. ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
  5149. cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
  5150. // sigmoid
  5151. ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
  5152. cb(cur_gate, "ffn_shexp_gate", il);
  5153. ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur,
  5154. model.layers[il].ffn_up_shexp, NULL, NULL,
  5155. model.layers[il].ffn_gate_shexp, NULL, NULL,
  5156. model.layers[il].ffn_down_shexp, NULL, NULL,
  5157. NULL,
  5158. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  5159. cb(cur_ffn, "ffn_shexp", il);
  5160. ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
  5161. cb(ffn_shexp_out, "ffn_shexp_out", il);
  5162. moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
  5163. cb(moe_out, "ffn_out", il);
  5164. cur = moe_out;
  5165. }
  5166. cur = ggml_add(ctx0, cur, ffn_inp);
  5167. cur = lctx.cvec.apply_to(ctx0, cur, il);
  5168. cb(cur, "l_out", il);
  5169. // input for next layer
  5170. inpL = cur;
  5171. }
  5172. cur = inpL;
  5173. cur = llm_build_norm(ctx0, cur, hparams,
  5174. model.output_norm, NULL,
  5175. LLM_NORM_RMS, cb, -1);
  5176. cb(cur, "result_norm", -1);
  5177. // lm_head
  5178. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  5179. cb(cur, "result_output", -1);
  5180. ggml_build_forward_expand(gf, cur);
  5181. return gf;
  5182. }
  5183. struct ggml_cgraph * build_phi2() {
  5184. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  5185. const int64_t n_embd_head = hparams.n_embd_head_v;
  5186. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5187. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5188. struct ggml_tensor * cur;
  5189. struct ggml_tensor * attn_norm_output;
  5190. struct ggml_tensor * ffn_output;
  5191. struct ggml_tensor * inpL;
  5192. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  5193. // inp_pos - contains the positions
  5194. struct ggml_tensor * inp_pos = build_inp_pos();
  5195. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  5196. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  5197. for (int il = 0; il < n_layer; ++il) {
  5198. attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
  5199. model.layers[il].attn_norm,
  5200. model.layers[il].attn_norm_b,
  5201. LLM_NORM, cb, il);
  5202. cb(attn_norm_output, "attn_norm", il);
  5203. // self-attention
  5204. {
  5205. struct ggml_tensor * Qcur = nullptr;
  5206. struct ggml_tensor * Kcur = nullptr;
  5207. struct ggml_tensor * Vcur = nullptr;
  5208. if (model.layers[il].wqkv) {
  5209. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
  5210. cb(cur, "wqkv", il);
  5211. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5212. cb(cur, "bqkv", il);
  5213. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5214. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5215. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5216. } else {
  5217. Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  5218. Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  5219. Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  5220. }
  5221. cb(Qcur, "Qcur", il);
  5222. cb(Kcur, "Kcur", il);
  5223. cb(Vcur, "Vcur", il);
  5224. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5225. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5226. Qcur = ggml_rope_ext(
  5227. ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  5228. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  5229. );
  5230. cb(Qcur, "Qcur", il);
  5231. // with phi2, we scale the Q to avoid precision issues
  5232. // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
  5233. Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
  5234. cb(Qcur, "Qcur", il);
  5235. Kcur = ggml_rope_ext(
  5236. ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  5237. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  5238. );
  5239. cb(Kcur, "Kcur", il);
  5240. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  5241. model.layers[il].wo, model.layers[il].bo,
  5242. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  5243. }
  5244. if (il == n_layer - 1) {
  5245. // skip computing output for unused tokens
  5246. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  5247. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5248. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5249. attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
  5250. }
  5251. // FF
  5252. {
  5253. ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output,
  5254. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5255. NULL, NULL, NULL,
  5256. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5257. NULL,
  5258. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  5259. cb(ffn_output, "ffn_out", il);
  5260. }
  5261. cur = ggml_add(ctx0, cur, ffn_output);
  5262. cur = ggml_add(ctx0, cur, inpL);
  5263. cur = lctx.cvec.apply_to(ctx0, cur, il);
  5264. cb(cur, "l_out", il);
  5265. // input for next layer
  5266. inpL = cur;
  5267. }
  5268. cur = llm_build_norm(ctx0, inpL, hparams,
  5269. model.output_norm,
  5270. model.output_norm_b,
  5271. LLM_NORM, cb, -1);
  5272. cb(cur, "result_norm", -1);
  5273. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  5274. cb(cur, "result_output_no_bias", -1);
  5275. cur = ggml_add(ctx0, cur, model.output_b);
  5276. cb(cur, "result_output", -1);
  5277. ggml_build_forward_expand(gf, cur);
  5278. return gf;
  5279. }
  5280. struct ggml_cgraph * build_phi3() {
  5281. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  5282. const int64_t n_embd_head = hparams.n_embd_head_v;
  5283. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5284. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5285. struct ggml_tensor * cur;
  5286. struct ggml_tensor * inpL;
  5287. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  5288. // inp_pos - contains the positions
  5289. struct ggml_tensor * inp_pos = build_inp_pos();
  5290. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  5291. struct ggml_tensor * KQ_mask = nullptr;
  5292. if (hparams.n_swa == 0) {
  5293. // Phi-4 doesn't use sliding window attention
  5294. KQ_mask = build_inp_KQ_mask();
  5295. } else {
  5296. KQ_mask = build_inp_KQ_mask_swa();
  5297. }
  5298. for (int il = 0; il < n_layer; ++il) {
  5299. auto residual = inpL;
  5300. // self-attention
  5301. {
  5302. // rope freq factors for 128k context
  5303. struct ggml_tensor * rope_factors = build_rope_factors(il);
  5304. struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
  5305. model.layers[il].attn_norm,
  5306. NULL,
  5307. LLM_NORM_RMS, cb, il);
  5308. cb(attn_norm_output, "attn_norm", il);
  5309. struct ggml_tensor * Qcur = nullptr;
  5310. struct ggml_tensor * Kcur = nullptr;
  5311. struct ggml_tensor * Vcur = nullptr;
  5312. if (model.layers[il].wqkv) {
  5313. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
  5314. cb(cur, "wqkv", il);
  5315. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
  5316. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
  5317. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
  5318. }
  5319. else {
  5320. Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  5321. Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  5322. Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  5323. }
  5324. cb(Qcur, "Qcur", il);
  5325. cb(Kcur, "Kcur", il);
  5326. cb(Vcur, "Vcur", il);
  5327. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5328. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  5329. Qcur = ggml_rope_ext(
  5330. ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
  5331. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  5332. );
  5333. cb(Qcur, "Qcur", il);
  5334. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  5335. cb(Qcur, "Qcur", il);
  5336. Kcur = ggml_rope_ext(
  5337. ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
  5338. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  5339. );
  5340. cb(Kcur, "Kcur", il);
  5341. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  5342. model.layers[il].wo, model.layers[il].bo,
  5343. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  5344. }
  5345. if (il == n_layer - 1) {
  5346. // skip computing output for unused tokens
  5347. struct ggml_tensor* inp_out_ids = build_inp_out_ids();
  5348. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5349. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  5350. }
  5351. cur = ggml_add(ctx0, cur, residual);
  5352. residual = cur;
  5353. cur = llm_build_norm(ctx0, cur, hparams,
  5354. model.layers[il].ffn_norm, NULL,
  5355. LLM_NORM_RMS, cb, il);
  5356. cb(cur, "ffn_norm", il);
  5357. // FF
  5358. // special-case: the up and gate tensors are merged into a single tensor
  5359. // TOOD: support into llm_build_ffn
  5360. {
  5361. cur = llm_build_ffn(ctx0, lctx, cur,
  5362. model.layers[il].ffn_up, NULL, NULL,
  5363. NULL, NULL, NULL,
  5364. model.layers[il].ffn_down, NULL, NULL,
  5365. NULL,
  5366. LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
  5367. cb(cur, "ffn_out", il);
  5368. }
  5369. cur = ggml_add(ctx0, residual, cur);
  5370. cur = lctx.cvec.apply_to(ctx0, cur, il);
  5371. cb(cur, "l_out", il);
  5372. // input for next layer
  5373. inpL = cur;
  5374. }
  5375. cur = llm_build_norm(ctx0, inpL, hparams,
  5376. model.output_norm,
  5377. NULL,
  5378. LLM_NORM_RMS, cb, -1);
  5379. cb(cur, "result_norm", -1);
  5380. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  5381. cb(cur, "result_output", -1);
  5382. ggml_build_forward_expand(gf, cur);
  5383. return gf;
  5384. }
  5385. struct ggml_cgraph * build_plamo() {
  5386. struct ggml_cgraph * gf = ggml_new_graph(ctx0);
  5387. const int64_t n_embd_head = hparams.n_embd_head_v;
  5388. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5389. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5390. struct ggml_tensor * cur;
  5391. struct ggml_tensor * inpL;
  5392. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  5393. // inp_pos - contains the positions
  5394. struct ggml_tensor * inp_pos = build_inp_pos();
  5395. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  5396. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  5397. for (int il = 0; il < n_layer; ++il) {
  5398. // norm
  5399. cur = llm_build_norm(ctx0, inpL, hparams,
  5400. model.layers[il].attn_norm, NULL,
  5401. LLM_NORM_RMS, cb, il);
  5402. cb(cur, "attn_norm", il);
  5403. struct ggml_tensor * attention_norm = cur;
  5404. // self-attention
  5405. {
  5406. // compute Q and K and RoPE them
  5407. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  5408. cb(Qcur, "Qcur", il);
  5409. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  5410. cb(Kcur, "Kcur", il);
  5411. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  5412. cb(Vcur, "Vcur", il);
  5413. Qcur = ggml_rope_ext(
  5414. ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
  5415. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  5416. ext_factor, attn_factor, beta_fast, beta_slow);
  5417. cb(Qcur, "Qcur", il);
  5418. Kcur = ggml_rope_ext(
  5419. ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
  5420. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  5421. ext_factor, attn_factor, beta_fast, beta_slow);
  5422. cb(Kcur, "Kcur", il);
  5423. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  5424. model.layers[il].wo, NULL,
  5425. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  5426. }
  5427. struct ggml_tensor * sa_out = cur;
  5428. cur = attention_norm;
  5429. if (il == n_layer - 1) {
  5430. // skip computing output for unused tokens
  5431. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  5432. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5433. sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
  5434. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5435. }
  5436. // feed-forward network
  5437. {
  5438. cur = llm_build_ffn(ctx0, lctx, cur,
  5439. model.layers[il].ffn_up, NULL, NULL,
  5440. model.layers[il].ffn_gate, NULL, NULL,
  5441. model.layers[il].ffn_down, NULL, NULL,
  5442. NULL,
  5443. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  5444. cb(cur, "ffn_out", il);
  5445. }
  5446. cur = ggml_add(ctx0, cur, sa_out);
  5447. cur = ggml_add(ctx0, cur, inpL);
  5448. cur = lctx.cvec.apply_to(ctx0, cur, il);
  5449. cb(cur, "l_out", il);
  5450. // input for next layer
  5451. inpL = cur;
  5452. }
  5453. cur = inpL;
  5454. cur = llm_build_norm(ctx0, cur, hparams,
  5455. model.output_norm, NULL,
  5456. LLM_NORM_RMS, cb, -1);
  5457. cb(cur, "result_norm", -1);
  5458. // lm_head
  5459. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  5460. cb(cur, "result_output", -1);
  5461. ggml_build_forward_expand(gf, cur);
  5462. return gf;
  5463. }
  5464. struct ggml_cgraph * build_gpt2() {
  5465. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  5466. const int64_t n_embd_head = hparams.n_embd_head_v;
  5467. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5468. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5469. struct ggml_tensor * cur;
  5470. struct ggml_tensor * pos;
  5471. struct ggml_tensor * inpL;
  5472. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  5473. // inp_pos - contains the positions
  5474. struct ggml_tensor * inp_pos = build_inp_pos();
  5475. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  5476. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  5477. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  5478. cb(pos, "pos_embd", -1);
  5479. inpL = ggml_add(ctx0, inpL, pos);
  5480. cb(inpL, "inpL", -1);
  5481. for (int il = 0; il < n_layer; ++il) {
  5482. cur = llm_build_norm(ctx0, inpL, hparams,
  5483. model.layers[il].attn_norm,
  5484. model.layers[il].attn_norm_b,
  5485. LLM_NORM, cb, il);
  5486. cb(cur, "attn_norm", il);
  5487. // self-attention
  5488. {
  5489. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  5490. cb(cur, "wqkv", il);
  5491. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5492. cb(cur, "bqkv", il);
  5493. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5494. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5495. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5496. cb(Qcur, "Qcur", il);
  5497. cb(Kcur, "Kcur", il);
  5498. cb(Vcur, "Vcur", il);
  5499. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  5500. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  5501. model.layers[il].wo, model.layers[il].bo,
  5502. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  5503. }
  5504. if (il == n_layer - 1) {
  5505. // skip computing output for unused tokens
  5506. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  5507. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5508. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5509. }
  5510. // add the input
  5511. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  5512. cb(ffn_inp, "ffn_inp", il);
  5513. // FF
  5514. {
  5515. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  5516. model.layers[il].ffn_norm,
  5517. model.layers[il].ffn_norm_b,
  5518. LLM_NORM, cb, il);
  5519. cb(cur, "ffn_norm", il);
  5520. cur = llm_build_ffn(ctx0, lctx, cur,
  5521. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5522. NULL, NULL, NULL,
  5523. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5524. NULL,
  5525. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  5526. cb(cur, "ffn_out", il);
  5527. }
  5528. cur = ggml_add(ctx0, cur, ffn_inp);
  5529. cur = lctx.cvec.apply_to(ctx0, cur, il);
  5530. cb(cur, "l_out", il);
  5531. // input for next layer
  5532. inpL = cur;
  5533. }
  5534. cur = llm_build_norm(ctx0, inpL, hparams,
  5535. model.output_norm,
  5536. model.output_norm_b,
  5537. LLM_NORM, cb, -1);
  5538. cb(cur, "result_norm", -1);
  5539. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  5540. cb(cur, "result_output", -1);
  5541. ggml_build_forward_expand(gf, cur);
  5542. return gf;
  5543. }
  5544. struct ggml_cgraph * build_codeshell() {
  5545. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  5546. const int64_t n_embd_head = hparams.n_embd_head_v;
  5547. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  5548. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5549. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5550. struct ggml_tensor * cur;
  5551. struct ggml_tensor * inpL;
  5552. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  5553. // inp_pos - contains the positions
  5554. struct ggml_tensor * inp_pos = build_inp_pos();
  5555. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  5556. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  5557. for (int il = 0; il < n_layer; ++il) {
  5558. cur = llm_build_norm(ctx0, inpL, hparams,
  5559. model.layers[il].attn_norm,
  5560. model.layers[il].attn_norm_b,
  5561. LLM_NORM, cb, il);
  5562. cb(cur, "attn_norm", il);
  5563. // self-attention
  5564. {
  5565. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  5566. cb(cur, "wqkv", il);
  5567. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  5568. cb(cur, "bqkv", il);
  5569. struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  5570. struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  5571. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  5572. cb(tmpq, "tmpq", il);
  5573. cb(tmpk, "tmpk", il);
  5574. cb(Vcur, "Vcur", il);
  5575. struct ggml_tensor * Qcur = ggml_rope_ext(
  5576. ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  5577. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5578. ext_factor, attn_factor, beta_fast, beta_slow
  5579. );
  5580. cb(Qcur, "Qcur", il);
  5581. struct ggml_tensor * Kcur = ggml_rope_ext(
  5582. ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  5583. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5584. ext_factor, attn_factor, beta_fast, beta_slow
  5585. );
  5586. cb(Kcur, "Kcur", il);
  5587. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  5588. model.layers[il].wo, model.layers[il].bo,
  5589. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  5590. }
  5591. if (il == n_layer - 1) {
  5592. // skip computing output for unused tokens
  5593. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  5594. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5595. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  5596. }
  5597. // add the input
  5598. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  5599. cb(ffn_inp, "ffn_inp", il);
  5600. // FF
  5601. {
  5602. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  5603. model.layers[il].ffn_norm,
  5604. model.layers[il].ffn_norm_b,
  5605. LLM_NORM, cb, il);
  5606. cb(cur, "ffn_norm", il);
  5607. cur = llm_build_ffn(ctx0, lctx, cur,
  5608. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  5609. NULL, NULL, NULL,
  5610. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  5611. NULL,
  5612. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  5613. cb(cur, "ffn_out", il);
  5614. }
  5615. cur = ggml_add(ctx0, cur, ffn_inp);
  5616. cur = lctx.cvec.apply_to(ctx0, cur, il);
  5617. cb(cur, "l_out", il);
  5618. // input for next layer
  5619. inpL = cur;
  5620. }
  5621. cur = llm_build_norm(ctx0, inpL, hparams,
  5622. model.output_norm,
  5623. model.output_norm_b,
  5624. LLM_NORM, cb, -1);
  5625. cb(cur, "result_norm", -1);
  5626. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  5627. cb(cur, "result_output", -1);
  5628. ggml_build_forward_expand(gf, cur);
  5629. return gf;
  5630. }
  5631. struct ggml_cgraph * build_orion() {
  5632. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  5633. const int64_t n_embd_head = hparams.n_embd_head_v;
  5634. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5635. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5636. struct ggml_tensor * cur;
  5637. struct ggml_tensor * inpL;
  5638. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  5639. // inp_pos - contains the positions
  5640. struct ggml_tensor * inp_pos = build_inp_pos();
  5641. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  5642. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  5643. for (int il = 0; il < n_layer; ++il) {
  5644. struct ggml_tensor * inpSA = inpL;
  5645. // norm
  5646. cur = llm_build_norm(ctx0, inpL, hparams,
  5647. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  5648. LLM_NORM, cb, il);
  5649. cb(cur, "attn_norm", il);
  5650. // self-attention
  5651. {
  5652. // compute Q and K and RoPE them
  5653. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  5654. cb(Qcur, "Qcur", il);
  5655. // if (model.layers[il].bq) {
  5656. // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5657. // cb(Qcur, "Qcur", il);
  5658. // }
  5659. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  5660. cb(Kcur, "Kcur", il);
  5661. // if (model.layers[il].bk) {
  5662. // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5663. // cb(Kcur, "Kcur", il);
  5664. // }
  5665. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  5666. cb(Vcur, "Vcur", il);
  5667. // if (model.layers[il].bv) {
  5668. // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5669. // cb(Vcur, "Vcur", il);
  5670. // }
  5671. Qcur = ggml_rope_ext(
  5672. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  5673. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5674. ext_factor, attn_factor, beta_fast, beta_slow
  5675. );
  5676. cb(Qcur, "Qcur", il);
  5677. Kcur = ggml_rope_ext(
  5678. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  5679. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5680. ext_factor, attn_factor, beta_fast, beta_slow
  5681. );
  5682. cb(Kcur, "Kcur", il);
  5683. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  5684. model.layers[il].wo, NULL,
  5685. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  5686. }
  5687. if (il == n_layer - 1) {
  5688. // skip computing output for unused tokens
  5689. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  5690. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5691. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5692. }
  5693. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5694. cb(ffn_inp, "ffn_inp", il);
  5695. // feed-forward network
  5696. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  5697. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  5698. LLM_NORM, cb, il);
  5699. cb(cur, "ffn_norm", il);
  5700. cur = llm_build_ffn(ctx0, lctx, cur,
  5701. model.layers[il].ffn_up, NULL, NULL,
  5702. model.layers[il].ffn_gate, NULL, NULL,
  5703. model.layers[il].ffn_down, NULL, NULL,
  5704. NULL,
  5705. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  5706. cb(cur, "ffn_out", il);
  5707. cur = ggml_add(ctx0, cur, ffn_inp);
  5708. cur = lctx.cvec.apply_to(ctx0, cur, il);
  5709. cb(cur, "l_out", il);
  5710. // input for next layer
  5711. inpL = cur;
  5712. }
  5713. cur = inpL;
  5714. cur = llm_build_norm(ctx0, cur, hparams,
  5715. model.output_norm, model.output_norm_b,
  5716. LLM_NORM, cb, -1);
  5717. cb(cur, "result_norm", -1);
  5718. // lm_head
  5719. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  5720. cb(cur, "result_output", -1);
  5721. ggml_build_forward_expand(gf, cur);
  5722. return gf;
  5723. }
  5724. struct ggml_cgraph * build_internlm2() {
  5725. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  5726. const int64_t n_embd_head = hparams.n_embd_head_v;
  5727. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  5728. GGML_ASSERT(n_embd_head == hparams.n_rot);
  5729. struct ggml_tensor * cur;
  5730. struct ggml_tensor * inpL;
  5731. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  5732. // inp_pos - contains the positions
  5733. struct ggml_tensor * inp_pos = build_inp_pos();
  5734. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  5735. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  5736. for (int il = 0; il < n_layer; ++il) {
  5737. struct ggml_tensor * inpSA = inpL;
  5738. // norm
  5739. cur = llm_build_norm(ctx0, inpL, hparams,
  5740. model.layers[il].attn_norm, NULL,
  5741. LLM_NORM_RMS, cb, il);
  5742. cb(cur, "attn_norm", il);
  5743. // self-attention
  5744. {
  5745. // compute Q and K and RoPE them
  5746. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  5747. cb(Qcur, "Qcur", il);
  5748. if (model.layers[il].bq) {
  5749. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  5750. cb(Qcur, "Qcur", il);
  5751. }
  5752. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  5753. cb(Kcur, "Kcur", il);
  5754. if (model.layers[il].bk) {
  5755. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  5756. cb(Kcur, "Kcur", il);
  5757. }
  5758. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  5759. cb(Vcur, "Vcur", il);
  5760. if (model.layers[il].bv) {
  5761. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  5762. cb(Vcur, "Vcur", il);
  5763. }
  5764. Qcur = ggml_rope_ext(
  5765. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  5766. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5767. ext_factor, attn_factor, beta_fast, beta_slow
  5768. );
  5769. cb(Qcur, "Qcur", il);
  5770. Kcur = ggml_rope_ext(
  5771. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  5772. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5773. ext_factor, attn_factor, beta_fast, beta_slow
  5774. );
  5775. cb(Kcur, "Kcur", il);
  5776. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  5777. model.layers[il].wo, model.layers[il].bo,
  5778. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  5779. }
  5780. if (il == n_layer - 1) {
  5781. // skip computing output for unused tokens
  5782. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  5783. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5784. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5785. }
  5786. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5787. cb(ffn_inp, "ffn_inp", il);
  5788. // feed-forward network
  5789. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  5790. model.layers[il].ffn_norm, NULL,
  5791. LLM_NORM_RMS, cb, il);
  5792. cb(cur, "ffn_norm", il);
  5793. cur = llm_build_ffn(ctx0, lctx, cur,
  5794. model.layers[il].ffn_up, NULL, NULL,
  5795. model.layers[il].ffn_gate, NULL, NULL,
  5796. model.layers[il].ffn_down, NULL, NULL,
  5797. NULL,
  5798. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  5799. cb(cur, "ffn_out", il);
  5800. cur = ggml_add(ctx0, cur, ffn_inp);
  5801. cur = lctx.cvec.apply_to(ctx0, cur, il);
  5802. cb(cur, "l_out", il);
  5803. // input for next layer
  5804. inpL = cur;
  5805. }
  5806. cur = inpL;
  5807. cur = llm_build_norm(ctx0, cur, hparams,
  5808. model.output_norm, NULL,
  5809. LLM_NORM_RMS, cb, -1);
  5810. cb(cur, "result_norm", -1);
  5811. // lm_head
  5812. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  5813. cb(cur, "result_output", -1);
  5814. ggml_build_forward_expand(gf, cur);
  5815. return gf;
  5816. }
  5817. struct ggml_cgraph * build_minicpm3() {
  5818. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  5819. //TODO: if the model varies, these parameters need to be read from the model
  5820. const int64_t n_embd_base = 256;
  5821. const float scale_embd = 12.0f;
  5822. const float scale_depth = 1.4f;
  5823. const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
  5824. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  5825. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  5826. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  5827. struct ggml_tensor * cur;
  5828. struct ggml_tensor * inpL;
  5829. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  5830. // scale the input embeddings
  5831. inpL = ggml_scale(ctx0, inpL, scale_embd);
  5832. cb(inpL, "inp_scaled", -1);
  5833. // inp_pos - contains the positions
  5834. struct ggml_tensor * inp_pos = build_inp_pos();
  5835. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  5836. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  5837. for (int il = 0; il < n_layer; ++il) {
  5838. struct ggml_tensor * inpSA = inpL;
  5839. struct ggml_tensor * rope_factors = build_rope_factors(il);
  5840. // norm
  5841. cur = llm_build_norm(ctx0, inpL, hparams,
  5842. model.layers[il].attn_norm, NULL,
  5843. LLM_NORM_RMS, cb, il);
  5844. cb(cur, "attn_norm", il);
  5845. // self_attention
  5846. {
  5847. struct ggml_tensor * q = NULL;
  5848. // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
  5849. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  5850. cb(q, "q", il);
  5851. q = llm_build_norm(ctx0, q, hparams,
  5852. model.layers[il].attn_q_a_norm, NULL,
  5853. LLM_NORM_RMS, cb, il);
  5854. cb(q, "q", il);
  5855. // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
  5856. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  5857. cb(q, "q", il);
  5858. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  5859. struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  5860. ggml_row_size(q->type, hparams.n_embd_head_k),
  5861. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  5862. 0);
  5863. cb(q_nope, "q_nope", il);
  5864. // and {n_head * n_embd_head_qk_rope, n_tokens}
  5865. struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  5866. ggml_row_size(q->type, hparams.n_embd_head_k),
  5867. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  5868. ggml_row_size(q->type, n_embd_head_qk_nope));
  5869. cb(q_pe, "q_pe", il);
  5870. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
  5871. struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
  5872. cb(kv_pe_compresseed, "kv_pe_compresseed", il);
  5873. // split into {kv_lora_rank, n_tokens}
  5874. struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
  5875. kv_pe_compresseed->nb[1],
  5876. 0);
  5877. cb(kv_compressed, "kv_compressed", il);
  5878. // and {n_embd_head_qk_rope, n_tokens}
  5879. struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
  5880. kv_pe_compresseed->nb[1],
  5881. kv_pe_compresseed->nb[1],
  5882. ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  5883. cb(k_pe, "k_pe", il);
  5884. kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
  5885. kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
  5886. model.layers[il].attn_kv_a_norm, NULL,
  5887. LLM_NORM_RMS, cb, il);
  5888. cb(kv_compressed, "kv_compressed", il);
  5889. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  5890. struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  5891. cb(kv, "kv", il);
  5892. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  5893. struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  5894. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  5895. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  5896. 0);
  5897. cb(k_nope, "k_nope", il);
  5898. // and {n_head * n_embd_head_v, n_tokens}
  5899. struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  5900. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  5901. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  5902. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  5903. cb(v_states, "v_states", il);
  5904. v_states = ggml_cont(ctx0, v_states);
  5905. cb(v_states, "v_states", il);
  5906. v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
  5907. ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
  5908. 0);
  5909. cb(v_states, "v_states", il);
  5910. q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
  5911. q_pe = ggml_rope_ext(
  5912. ctx0, q_pe, inp_pos, rope_factors,
  5913. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5914. ext_factor, attn_factor, beta_fast, beta_slow
  5915. );
  5916. cb(q_pe, "q_pe", il);
  5917. // shared RoPE key
  5918. k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
  5919. k_pe = ggml_rope_ext(
  5920. ctx0, k_pe, inp_pos, rope_factors,
  5921. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  5922. ext_factor, attn_factor, beta_fast, beta_slow
  5923. );
  5924. cb(k_pe, "k_pe", il);
  5925. struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  5926. cb(q_states, "q_states", il);
  5927. struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  5928. cb(k_states, "k_states", il);
  5929. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  5930. model.layers[il].wo, NULL,
  5931. k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
  5932. }
  5933. if (il == n_layer - 1) {
  5934. // skip computing output for unused tokens
  5935. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  5936. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  5937. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  5938. }
  5939. // scale_res - scale the hidden states for residual connection
  5940. const float scale_res = scale_depth/sqrtf(float(n_layer));
  5941. cur = ggml_scale(ctx0, cur, scale_res);
  5942. cb(cur, "hidden_scaled", il);
  5943. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  5944. cb(ffn_inp, "ffn_inp", il);
  5945. // feed-forward network
  5946. {
  5947. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  5948. model.layers[il].ffn_norm, NULL,
  5949. LLM_NORM_RMS, cb, il);
  5950. cb(cur, "ffn_norm", il);
  5951. cur = llm_build_ffn(ctx0, lctx, cur,
  5952. model.layers[il].ffn_up, NULL, NULL,
  5953. model.layers[il].ffn_gate, NULL, NULL,
  5954. model.layers[il].ffn_down, NULL, NULL,
  5955. NULL,
  5956. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  5957. cb(cur, "ffn_out", il);
  5958. }
  5959. // scale the hidden states for residual connection
  5960. cur = ggml_scale(ctx0, cur, scale_res);
  5961. cb(cur, "hidden_scaled_ffn", il);
  5962. cur = ggml_add(ctx0, cur, ffn_inp);
  5963. cur = lctx.cvec.apply_to(ctx0, cur, il);
  5964. cb(cur, "l_out", il);
  5965. // input for next layer
  5966. inpL = cur;
  5967. }
  5968. cur = inpL;
  5969. cur = llm_build_norm(ctx0, cur, hparams,
  5970. model.output_norm, NULL,
  5971. LLM_NORM_RMS, cb, -1);
  5972. cb(cur, "result_norm", -1);
  5973. // lm_head scaling
  5974. const float scale_lmhead = float(n_embd_base)/float(n_embd);
  5975. cur = ggml_scale(ctx0, cur, scale_lmhead);
  5976. cb(cur, "lmhead_scaling", -1);
  5977. // lm_head
  5978. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  5979. cb(cur, "result_output", -1);
  5980. ggml_build_forward_expand(gf, cur);
  5981. return gf;
  5982. }
  5983. struct ggml_cgraph * build_gemma() {
  5984. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  5985. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  5986. struct ggml_tensor * cur;
  5987. struct ggml_tensor * inpL;
  5988. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  5989. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  5990. cb(inpL, "inp_scaled", -1);
  5991. // inp_pos - contains the positions
  5992. struct ggml_tensor * inp_pos = build_inp_pos();
  5993. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  5994. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  5995. for (int il = 0; il < n_layer; ++il) {
  5996. // norm
  5997. cur = llm_build_norm(ctx0, inpL, hparams,
  5998. model.layers[il].attn_norm, NULL,
  5999. LLM_NORM_RMS, cb, il);
  6000. cb(cur, "attn_norm", il);
  6001. // self-attention
  6002. {
  6003. // compute Q and K and RoPE them
  6004. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  6005. cb(Qcur, "Qcur", il);
  6006. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  6007. cb(Kcur, "Kcur", il);
  6008. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  6009. cb(Vcur, "Vcur", il);
  6010. Qcur = ggml_rope_ext(
  6011. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
  6012. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6013. ext_factor, attn_factor, beta_fast, beta_slow);
  6014. cb(Qcur, "Qcur", il);
  6015. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
  6016. cb(Qcur, "Qcur_scaled", il);
  6017. Kcur = ggml_rope_ext(
  6018. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
  6019. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6020. ext_factor, attn_factor, beta_fast, beta_slow);
  6021. cb(Kcur, "Kcur", il);
  6022. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  6023. model.layers[il].wo, NULL,
  6024. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  6025. }
  6026. if (il == n_layer - 1) {
  6027. // skip computing output for unused tokens
  6028. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6029. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6030. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6031. }
  6032. struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  6033. cb(sa_out, "sa_out", il);
  6034. cur = llm_build_norm(ctx0, sa_out, hparams,
  6035. model.layers[il].ffn_norm, NULL,
  6036. LLM_NORM_RMS, cb, il);
  6037. cb(cur, "ffn_norm", il);
  6038. // feed-forward network
  6039. {
  6040. cur = llm_build_ffn(ctx0, lctx, cur,
  6041. model.layers[il].ffn_up, NULL, NULL,
  6042. model.layers[il].ffn_gate, NULL, NULL,
  6043. model.layers[il].ffn_down, NULL, NULL,
  6044. NULL,
  6045. LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
  6046. cb(cur, "ffn_out", il);
  6047. }
  6048. cur = ggml_add(ctx0, cur, sa_out);
  6049. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6050. cb(cur, "l_out", il);
  6051. // input for next layer
  6052. inpL = cur;
  6053. }
  6054. cur = inpL;
  6055. cur = llm_build_norm(ctx0, cur, hparams,
  6056. model.output_norm, NULL,
  6057. LLM_NORM_RMS, cb, -1);
  6058. cb(cur, "result_norm", -1);
  6059. // lm_head
  6060. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  6061. cb(cur, "result_output", -1);
  6062. ggml_build_forward_expand(gf, cur);
  6063. return gf;
  6064. }
  6065. struct ggml_cgraph * build_gemma2() {
  6066. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  6067. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  6068. struct ggml_tensor * cur;
  6069. struct ggml_tensor * inpL;
  6070. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  6071. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  6072. cb(inpL, "inp_scaled", -1);
  6073. // inp_pos - contains the positions
  6074. struct ggml_tensor * inp_pos = build_inp_pos();
  6075. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  6076. // gemma 2 requires different mask for layers using sliding window (SWA)
  6077. struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true);
  6078. struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
  6079. for (int il = 0; il < n_layer; ++il) {
  6080. // (il % 2) layers use SWA
  6081. struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? KQ_mask_swa : KQ_mask;
  6082. // norm
  6083. cur = llm_build_norm(ctx0, inpL, hparams,
  6084. model.layers[il].attn_norm, NULL,
  6085. LLM_NORM_RMS, cb, il);
  6086. cb(cur, "attn_norm", il);
  6087. // self-attention
  6088. {
  6089. // compute Q and K and RoPE them
  6090. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  6091. cb(Qcur, "Qcur", il);
  6092. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  6093. cb(Kcur, "Kcur", il);
  6094. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  6095. cb(Vcur, "Vcur", il);
  6096. Qcur = ggml_rope_ext(
  6097. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
  6098. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6099. ext_factor, attn_factor, beta_fast, beta_slow);
  6100. cb(Qcur, "Qcur", il);
  6101. // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
  6102. switch (model.type) {
  6103. case llm_type::MODEL_2B:
  6104. case llm_type::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
  6105. case llm_type::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
  6106. default: GGML_ABORT("fatal error");
  6107. };
  6108. cb(Qcur, "Qcur_scaled", il);
  6109. Kcur = ggml_rope_ext(
  6110. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
  6111. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6112. ext_factor, attn_factor, beta_fast, beta_slow);
  6113. cb(Kcur, "Kcur", il);
  6114. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  6115. model.layers[il].wo, NULL,
  6116. Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  6117. }
  6118. cur = llm_build_norm(ctx0, cur, hparams,
  6119. model.layers[il].attn_post_norm, NULL,
  6120. LLM_NORM_RMS, cb, il);
  6121. cb(cur, "attn_post_norm", il);
  6122. if (il == n_layer - 1) {
  6123. // skip computing output for unused tokens
  6124. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6125. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6126. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6127. }
  6128. struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  6129. cb(sa_out, "sa_out", il);
  6130. cur = llm_build_norm(ctx0, sa_out, hparams,
  6131. model.layers[il].ffn_norm, NULL,
  6132. LLM_NORM_RMS, cb, il);
  6133. cb(cur, "ffn_norm", il);
  6134. // feed-forward network
  6135. {
  6136. cur = llm_build_ffn(ctx0, lctx, cur,
  6137. model.layers[il].ffn_up, NULL, NULL,
  6138. model.layers[il].ffn_gate, NULL, NULL,
  6139. model.layers[il].ffn_down, NULL, NULL,
  6140. NULL,
  6141. LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
  6142. cb(cur, "ffn_out", il);
  6143. }
  6144. cur = llm_build_norm(ctx0, cur, hparams,
  6145. model.layers[il].ffn_post_norm, NULL,
  6146. LLM_NORM_RMS, cb, -1);
  6147. cb(cur, "ffn_post_norm", -1);
  6148. cur = ggml_add(ctx0, cur, sa_out);
  6149. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6150. cb(cur, "l_out", il);
  6151. // input for next layer
  6152. inpL = cur;
  6153. }
  6154. cur = inpL;
  6155. cur = llm_build_norm(ctx0, cur, hparams,
  6156. model.output_norm, NULL,
  6157. LLM_NORM_RMS, cb, -1);
  6158. cb(cur, "result_norm", -1);
  6159. // lm_head
  6160. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  6161. // final logit soft-capping
  6162. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  6163. cur = ggml_tanh(ctx0, cur);
  6164. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
  6165. cb(cur, "result_output", -1);
  6166. ggml_build_forward_expand(gf, cur);
  6167. return gf;
  6168. }
  6169. struct ggml_cgraph * build_starcoder2() {
  6170. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  6171. const int64_t n_embd_head = hparams.n_embd_head_v;
  6172. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6173. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6174. struct ggml_tensor * cur;
  6175. struct ggml_tensor * inpL;
  6176. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  6177. // inp_pos - contains the positions
  6178. struct ggml_tensor * inp_pos = build_inp_pos();
  6179. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  6180. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  6181. for (int il = 0; il < n_layer; ++il) {
  6182. struct ggml_tensor * inpSA = inpL;
  6183. // norm
  6184. cur = llm_build_norm(ctx0, inpL, hparams,
  6185. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  6186. LLM_NORM, cb, il);
  6187. cb(cur, "attn_norm", il);
  6188. // self-attention
  6189. {
  6190. // compute Q and K and RoPE them
  6191. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  6192. cb(Qcur, "Qcur", il);
  6193. if (model.layers[il].bq) {
  6194. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6195. cb(Qcur, "Qcur", il);
  6196. }
  6197. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  6198. cb(Kcur, "Kcur", il);
  6199. if (model.layers[il].bk) {
  6200. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6201. cb(Kcur, "Kcur", il);
  6202. }
  6203. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  6204. cb(Vcur, "Vcur", il);
  6205. if (model.layers[il].bv) {
  6206. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6207. cb(Vcur, "Vcur", il);
  6208. }
  6209. Qcur = ggml_rope_ext(
  6210. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  6211. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6212. ext_factor, attn_factor, beta_fast, beta_slow
  6213. );
  6214. cb(Qcur, "Qcur", il);
  6215. Kcur = ggml_rope_ext(
  6216. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  6217. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6218. ext_factor, attn_factor, beta_fast, beta_slow
  6219. );
  6220. cb(Kcur, "Kcur", il);
  6221. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  6222. model.layers[il].wo, model.layers[il].bo,
  6223. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  6224. }
  6225. if (il == n_layer - 1) {
  6226. // skip computing output for unused tokens
  6227. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6228. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6229. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6230. }
  6231. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6232. cb(ffn_inp, "ffn_inp", il);
  6233. // feed-forward network
  6234. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  6235. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  6236. LLM_NORM, cb, il);
  6237. cb(cur, "ffn_norm", il);
  6238. cur = llm_build_ffn(ctx0, lctx, cur,
  6239. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6240. NULL, NULL, NULL,
  6241. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6242. NULL,
  6243. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  6244. cb(cur, "ffn_out", il);
  6245. cur = ggml_add(ctx0, cur, ffn_inp);
  6246. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6247. cb(cur, "l_out", il);
  6248. // input for next layer
  6249. inpL = cur;
  6250. }
  6251. cur = inpL;
  6252. cur = llm_build_norm(ctx0, cur, hparams,
  6253. model.output_norm, model.output_norm_b,
  6254. LLM_NORM, cb, -1);
  6255. cb(cur, "result_norm", -1);
  6256. // lm_head
  6257. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  6258. cb(cur, "result_output", -1);
  6259. ggml_build_forward_expand(gf, cur);
  6260. return gf;
  6261. }
  6262. struct ggml_cgraph * build_mamba() {
  6263. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  6264. struct ggml_tensor * cur;
  6265. struct ggml_tensor * inpL;
  6266. // {n_embd, n_tokens}
  6267. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  6268. struct ggml_tensor * state_copy = build_inp_s_copy();
  6269. struct ggml_tensor * state_mask = build_inp_s_mask();
  6270. for (int il = 0; il < n_layer; ++il) {
  6271. // norm
  6272. cur = llm_build_norm(ctx0, inpL, hparams,
  6273. model.layers[il].attn_norm, NULL,
  6274. LLM_NORM_RMS, cb, il);
  6275. cb(cur, "attn_norm", il);
  6276. cur = llm_build_mamba(ctx0, lctx, ubatch, gf, cur,
  6277. state_copy, state_mask,
  6278. kv_head, n_kv, cb, il);
  6279. if (il == n_layer - 1) {
  6280. // skip computing output for unused tokens
  6281. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6282. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6283. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6284. }
  6285. // residual
  6286. cur = ggml_add(ctx0, cur, inpL);
  6287. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6288. cb(cur, "l_out", il);
  6289. // input for next layer
  6290. inpL = cur;
  6291. }
  6292. // final rmsnorm
  6293. cur = llm_build_norm(ctx0, inpL, hparams,
  6294. model.output_norm, NULL,
  6295. LLM_NORM_RMS, cb, -1);
  6296. cb(cur, "result_norm", -1);
  6297. // lm_head
  6298. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  6299. cb(cur, "result_output", -1);
  6300. ggml_build_forward_expand(gf, cur);
  6301. return gf;
  6302. }
  6303. struct ggml_cgraph * build_command_r() {
  6304. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  6305. const int64_t n_embd_head = hparams.n_embd_head_v;
  6306. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6307. const float f_logit_scale = hparams.f_logit_scale;
  6308. struct ggml_tensor * cur;
  6309. struct ggml_tensor * inpL;
  6310. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  6311. // inp_pos - contains the positions
  6312. struct ggml_tensor * inp_pos = build_inp_pos();
  6313. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  6314. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  6315. for (int il = 0; il < n_layer; ++il) {
  6316. // norm
  6317. cur = llm_build_norm(ctx0, inpL, hparams,
  6318. model.layers[il].attn_norm, NULL,
  6319. LLM_NORM, cb, il);
  6320. cb(cur, "attn_norm", il);
  6321. struct ggml_tensor * ffn_inp = cur;
  6322. // self-attention
  6323. {
  6324. // compute Q and K and RoPE them
  6325. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  6326. cb(Qcur, "Qcur", il);
  6327. if (model.layers[il].bq) {
  6328. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6329. cb(Qcur, "Qcur", il);
  6330. }
  6331. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  6332. cb(Kcur, "Kcur", il);
  6333. if (model.layers[il].bk) {
  6334. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6335. cb(Kcur, "Kcur", il);
  6336. }
  6337. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  6338. cb(Vcur, "Vcur", il);
  6339. if (model.layers[il].bv) {
  6340. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6341. cb(Vcur, "Vcur", il);
  6342. }
  6343. if (model.layers[il].attn_q_norm) {
  6344. Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
  6345. ggml_element_size(Qcur) * n_embd_head,
  6346. ggml_element_size(Qcur) * n_embd_head * n_head,
  6347. 0);
  6348. cb(Qcur, "Qcur", il);
  6349. Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
  6350. ggml_element_size(Kcur) * n_embd_head,
  6351. ggml_element_size(Kcur) * n_embd_head * n_head_kv,
  6352. 0);
  6353. cb(Kcur, "Kcur", il);
  6354. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  6355. model.layers[il].attn_q_norm,
  6356. NULL,
  6357. LLM_NORM, cb, il);
  6358. cb(Qcur, "Qcur", il);
  6359. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  6360. model.layers[il].attn_k_norm,
  6361. NULL,
  6362. LLM_NORM, cb, il);
  6363. cb(Kcur, "Kcur", il);
  6364. }
  6365. Qcur = ggml_rope_ext(
  6366. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  6367. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6368. ext_factor, attn_factor, beta_fast, beta_slow
  6369. );
  6370. cb(Qcur, "Qcur", il);
  6371. Kcur = ggml_rope_ext(
  6372. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  6373. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6374. ext_factor, attn_factor, beta_fast, beta_slow
  6375. );
  6376. cb(Kcur, "Kcur", il);
  6377. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  6378. model.layers[il].wo, model.layers[il].bo,
  6379. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  6380. }
  6381. if (il == n_layer - 1) {
  6382. // skip computing output for unused tokens
  6383. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6384. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6385. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6386. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  6387. }
  6388. struct ggml_tensor * attn_out = cur;
  6389. // feed-forward network
  6390. {
  6391. cur = llm_build_ffn(ctx0, lctx, ffn_inp,
  6392. model.layers[il].ffn_up, NULL, NULL,
  6393. model.layers[il].ffn_gate, NULL, NULL,
  6394. model.layers[il].ffn_down, NULL, NULL,
  6395. NULL,
  6396. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  6397. cb(cur, "ffn_out", il);
  6398. }
  6399. // add together residual + FFN + self-attention
  6400. cur = ggml_add(ctx0, cur, inpL);
  6401. cur = ggml_add(ctx0, cur, attn_out);
  6402. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6403. cb(cur, "l_out", il);
  6404. // input for next layer
  6405. inpL = cur;
  6406. }
  6407. cur = inpL;
  6408. cur = llm_build_norm(ctx0, cur, hparams,
  6409. model.output_norm, NULL,
  6410. LLM_NORM, cb, -1);
  6411. cb(cur, "result_norm", -1);
  6412. // lm_head
  6413. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  6414. if (f_logit_scale) {
  6415. cur = ggml_scale(ctx0, cur, f_logit_scale);
  6416. }
  6417. cb(cur, "result_output", -1);
  6418. ggml_build_forward_expand(gf, cur);
  6419. return gf;
  6420. }
  6421. struct ggml_cgraph * build_cohere2() {
  6422. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  6423. const int64_t n_embd_head = hparams.n_embd_head_v;
  6424. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6425. const float f_logit_scale = hparams.f_logit_scale;
  6426. struct ggml_tensor * cur;
  6427. struct ggml_tensor * inpL;
  6428. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  6429. // inp_pos - contains the positions
  6430. struct ggml_tensor * inp_pos = build_inp_pos();
  6431. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  6432. // cohere2 requires different mask for layers using sliding window (SWA)
  6433. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  6434. struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
  6435. // sliding window switch pattern
  6436. const int32_t sliding_window_pattern = 4;
  6437. for (int il = 0; il < n_layer; ++il) {
  6438. // three layers sliding window attention (window size 4096) and ROPE
  6439. // fourth layer uses global attention without positional embeddings
  6440. const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1);
  6441. struct ggml_tensor * KQ_mask_l = is_sliding ? KQ_mask_swa : KQ_mask;
  6442. // norm
  6443. cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM, cb, il);
  6444. cb(cur, "attn_norm", il);
  6445. struct ggml_tensor * ffn_inp = cur;
  6446. // self-attention
  6447. {
  6448. // rope freq factors for 128k context
  6449. struct ggml_tensor * rope_factors = build_rope_factors(il);
  6450. // compute Q and K and RoPE them
  6451. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  6452. cb(Qcur, "Qcur", il);
  6453. if (model.layers[il].bq) {
  6454. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  6455. cb(Qcur, "Qcur", il);
  6456. }
  6457. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  6458. cb(Kcur, "Kcur", il);
  6459. if (model.layers[il].bk) {
  6460. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  6461. cb(Kcur, "Kcur", il);
  6462. }
  6463. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  6464. cb(Vcur, "Vcur", il);
  6465. if (model.layers[il].bv) {
  6466. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  6467. cb(Vcur, "Vcur", il);
  6468. }
  6469. if (is_sliding) {
  6470. Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
  6471. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
  6472. beta_fast, beta_slow);
  6473. cb(Qcur, "Qcur", il);
  6474. Kcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
  6475. rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor,
  6476. attn_factor, beta_fast, beta_slow);
  6477. cb(Kcur, "Kcur", il);
  6478. } else {
  6479. // For non-sliding layers, just reshape without applying RoPE
  6480. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6481. cb(Qcur, "Qcur", il);
  6482. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6483. cb(Kcur, "Kcur", il);
  6484. }
  6485. cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur,
  6486. KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
  6487. }
  6488. if (il == n_layer - 1) {
  6489. // skip computing output for unused tokens
  6490. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6491. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6492. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6493. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  6494. }
  6495. struct ggml_tensor * attn_out = cur;
  6496. // feed-forward network
  6497. {
  6498. cur = llm_build_ffn(ctx0, lctx, ffn_inp, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate,
  6499. NULL, NULL, model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR,
  6500. cb, il);
  6501. cb(cur, "ffn_out", il);
  6502. }
  6503. // add together residual + FFN + self-attention
  6504. cur = ggml_add(ctx0, cur, inpL);
  6505. cur = ggml_add(ctx0, cur, attn_out);
  6506. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6507. cb(cur, "l_out", il);
  6508. // input for next layer
  6509. inpL = cur;
  6510. }
  6511. cur = inpL;
  6512. cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM, cb, -1);
  6513. cb(cur, "result_norm", -1);
  6514. // lm_head
  6515. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  6516. if (f_logit_scale) {
  6517. cur = ggml_scale(ctx0, cur, f_logit_scale);
  6518. }
  6519. cb(cur, "result_output", -1);
  6520. ggml_build_forward_expand(gf, cur);
  6521. return gf;
  6522. }
  6523. // ref: https://allenai.org/olmo
  6524. // based on the original build_llama() function, changes:
  6525. // * non-parametric layer norm
  6526. // * clamp qkv
  6527. // * removed bias
  6528. // * removed MoE
  6529. struct ggml_cgraph * build_olmo() {
  6530. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  6531. // mutable variable, needed during the last layer of the computation to skip unused tokens
  6532. int32_t n_tokens = this->n_tokens;
  6533. const int64_t n_embd_head = hparams.n_embd_head_v;
  6534. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6535. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6536. struct ggml_tensor * cur;
  6537. struct ggml_tensor * inpL;
  6538. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  6539. // inp_pos - contains the positions
  6540. struct ggml_tensor * inp_pos = build_inp_pos();
  6541. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  6542. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  6543. for (int il = 0; il < n_layer; ++il) {
  6544. struct ggml_tensor * inpSA = inpL;
  6545. // norm
  6546. cur = llm_build_norm(ctx0, inpL, hparams,
  6547. NULL, NULL,
  6548. LLM_NORM, cb, il);
  6549. cb(cur, "attn_norm", il);
  6550. // self-attention
  6551. {
  6552. // compute Q and K and RoPE them
  6553. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  6554. cb(Qcur, "Qcur", il);
  6555. if (hparams.f_clamp_kqv > 0.0f) {
  6556. Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  6557. cb(Qcur, "Qcur", il);
  6558. }
  6559. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  6560. cb(Kcur, "Kcur", il);
  6561. if (hparams.f_clamp_kqv > 0.0f) {
  6562. Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  6563. cb(Kcur, "Kcur", il);
  6564. }
  6565. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  6566. cb(Vcur, "Vcur", il);
  6567. if (hparams.f_clamp_kqv > 0.0f) {
  6568. Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  6569. cb(Vcur, "Vcur", il);
  6570. }
  6571. Qcur = ggml_rope_ext(
  6572. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  6573. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6574. ext_factor, attn_factor, beta_fast, beta_slow
  6575. );
  6576. cb(Qcur, "Qcur", il);
  6577. Kcur = ggml_rope_ext(
  6578. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  6579. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6580. ext_factor, attn_factor, beta_fast, beta_slow
  6581. );
  6582. cb(Kcur, "Kcur", il);
  6583. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  6584. model.layers[il].wo, nullptr,
  6585. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  6586. }
  6587. if (il == n_layer - 1) {
  6588. // skip computing output for unused tokens
  6589. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6590. n_tokens = n_outputs;
  6591. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6592. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6593. }
  6594. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6595. cb(ffn_inp, "ffn_inp", il);
  6596. // feed-forward network
  6597. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  6598. NULL, NULL,
  6599. LLM_NORM, cb, il);
  6600. cb(cur, "ffn_norm", il);
  6601. cur = llm_build_ffn(ctx0, lctx, cur,
  6602. model.layers[il].ffn_up, NULL, NULL,
  6603. model.layers[il].ffn_gate, NULL, NULL,
  6604. model.layers[il].ffn_down, NULL, NULL,
  6605. NULL,
  6606. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  6607. cb(cur, "ffn_out", il);
  6608. cur = ggml_add(ctx0, cur, ffn_inp);
  6609. cb(cur, "ffn_out", il);
  6610. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6611. cb(cur, "l_out", il);
  6612. // input for next layer
  6613. inpL = cur;
  6614. }
  6615. cur = inpL;
  6616. cur = llm_build_norm(ctx0, cur, hparams,
  6617. NULL, NULL,
  6618. LLM_NORM, cb, -1);
  6619. cb(cur, "result_norm", -1);
  6620. // lm_head
  6621. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  6622. cb(cur, "result_output", -1);
  6623. ggml_build_forward_expand(gf, cur);
  6624. return gf;
  6625. }
  6626. struct ggml_cgraph * build_olmo2() {
  6627. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  6628. // mutable variable, needed during the last layer of the computation to skip unused tokens
  6629. int32_t n_tokens = this->n_tokens;
  6630. const int64_t n_embd_head = hparams.n_embd_head_v;
  6631. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6632. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6633. struct ggml_tensor * cur;
  6634. struct ggml_tensor * inpL;
  6635. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  6636. // inp_pos - contains the positions
  6637. struct ggml_tensor * inp_pos = build_inp_pos();
  6638. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  6639. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  6640. for (int il = 0; il < n_layer; ++il) {
  6641. struct ggml_tensor * inpSA = inpL;
  6642. cur = inpL;
  6643. // self_attention
  6644. {
  6645. // compute Q and K and RoPE them
  6646. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  6647. cb(Qcur, "Qcur", il);
  6648. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  6649. cb(Kcur, "Kcur", il);
  6650. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  6651. cb(Vcur, "Vcur", il);
  6652. Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
  6653. LLM_NORM_RMS, cb, il);
  6654. cb(Qcur, "Qcur_normed", il);
  6655. Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
  6656. LLM_NORM_RMS, cb, il);
  6657. cb(Kcur, "Kcur_normed", il);
  6658. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6659. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6660. Qcur = ggml_rope_ext(
  6661. ctx0, Qcur, inp_pos, nullptr,
  6662. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6663. ext_factor, attn_factor, beta_fast, beta_slow
  6664. );
  6665. cb(Qcur, "Qcur_rope", il);
  6666. Kcur = ggml_rope_ext(
  6667. ctx0, Kcur, inp_pos, nullptr,
  6668. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6669. ext_factor, attn_factor, beta_fast, beta_slow
  6670. );
  6671. cb(Kcur, "Kcur_rope", il);
  6672. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  6673. model.layers[il].wo, NULL,
  6674. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  6675. }
  6676. cur = llm_build_norm(ctx0, cur, hparams,
  6677. model.layers[il].attn_post_norm, NULL,
  6678. LLM_NORM_RMS, cb, il);
  6679. cb(cur, "attn_post_norm", il);
  6680. if (il == n_layer - 1) {
  6681. // skip computing output for unused tokens
  6682. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6683. n_tokens = n_outputs;
  6684. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6685. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6686. }
  6687. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6688. cb(ffn_inp, "ffn_inp", il);
  6689. // feed-forward network
  6690. cur = llm_build_ffn(ctx0, lctx, ffn_inp,
  6691. model.layers[il].ffn_up, NULL, NULL,
  6692. model.layers[il].ffn_gate, NULL, NULL,
  6693. model.layers[il].ffn_down, NULL, NULL,
  6694. NULL,
  6695. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  6696. cb(cur, "ffn_out", il);
  6697. cur = llm_build_norm(ctx0, cur, hparams,
  6698. model.layers[il].ffn_post_norm, NULL,
  6699. LLM_NORM_RMS, cb, -1);
  6700. cb(cur, "ffn_post_norm", -1);
  6701. cur = ggml_add(ctx0, cur, ffn_inp);
  6702. cb(cur, "ffn_out", il);
  6703. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6704. cb(cur, "l_out", il);
  6705. // input for next layer
  6706. inpL = cur;
  6707. }
  6708. cur = inpL;
  6709. cur = llm_build_norm(ctx0, cur, hparams,
  6710. model.output_norm, NULL,
  6711. LLM_NORM_RMS, cb, -1);
  6712. cb(cur, "result_norm", -1);
  6713. // lm_head
  6714. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  6715. cb(cur, "result_output", -1);
  6716. ggml_build_forward_expand(gf, cur);
  6717. return gf;
  6718. }
  6719. // based on the build_qwen2moe() function, changes:
  6720. // * removed shared experts
  6721. // * removed bias
  6722. // * added q, k norm
  6723. struct ggml_cgraph * build_olmoe() {
  6724. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  6725. // mutable variable, needed during the last layer of the computation to skip unused tokens
  6726. int32_t n_tokens = this->n_tokens;
  6727. const int64_t n_embd_head = hparams.n_embd_head_v;
  6728. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6729. GGML_ASSERT(n_embd_head == hparams.n_rot);
  6730. struct ggml_tensor * cur;
  6731. struct ggml_tensor * inpL;
  6732. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  6733. // inp_pos - contains the positions
  6734. struct ggml_tensor * inp_pos = build_inp_pos();
  6735. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  6736. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  6737. for (int il = 0; il < n_layer; ++il) {
  6738. struct ggml_tensor * inpSA = inpL;
  6739. // norm
  6740. cur = llm_build_norm(ctx0, inpL, hparams,
  6741. model.layers[il].attn_norm, NULL,
  6742. LLM_NORM_RMS, cb, il);
  6743. cb(cur, "attn_norm", il);
  6744. // self_attention
  6745. {
  6746. // compute Q and K and RoPE them
  6747. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  6748. cb(Qcur, "Qcur", il);
  6749. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  6750. cb(Kcur, "Kcur", il);
  6751. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  6752. cb(Vcur, "Vcur", il);
  6753. Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
  6754. LLM_NORM_RMS, cb, il);
  6755. cb(Qcur, "Qcur_normed", il);
  6756. Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
  6757. LLM_NORM_RMS, cb, il);
  6758. cb(Kcur, "Kcur_normed", il);
  6759. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  6760. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  6761. Qcur = ggml_rope_ext(
  6762. ctx0, Qcur, inp_pos, nullptr,
  6763. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6764. ext_factor, attn_factor, beta_fast, beta_slow
  6765. );
  6766. cb(Qcur, "Qcur_rope", il);
  6767. Kcur = ggml_rope_ext(
  6768. ctx0, Kcur, inp_pos, nullptr,
  6769. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6770. ext_factor, attn_factor, beta_fast, beta_slow
  6771. );
  6772. cb(Kcur, "Kcur_rope", il);
  6773. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  6774. model.layers[il].wo, NULL,
  6775. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  6776. }
  6777. if (il == n_layer - 1) {
  6778. // skip computing output for unused tokens
  6779. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6780. n_tokens = n_outputs;
  6781. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6782. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  6783. }
  6784. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  6785. cb(ffn_inp, "ffn_inp", il);
  6786. // MoE branch
  6787. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  6788. model.layers[il].ffn_norm, NULL,
  6789. LLM_NORM_RMS, cb, il);
  6790. cb(cur, "ffn_norm", il);
  6791. cur = llm_build_moe_ffn(ctx0, lctx, cur,
  6792. model.layers[il].ffn_gate_inp,
  6793. model.layers[il].ffn_up_exps,
  6794. model.layers[il].ffn_gate_exps,
  6795. model.layers[il].ffn_down_exps,
  6796. nullptr,
  6797. n_expert, n_expert_used,
  6798. LLM_FFN_SILU, false,
  6799. false, 0.0,
  6800. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  6801. cb, il);
  6802. cb(cur, "ffn_moe_out", il);
  6803. cur = ggml_add(ctx0, cur, ffn_inp);
  6804. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6805. cb(cur, "l_out", il);
  6806. // input for next layer
  6807. inpL = cur;
  6808. }
  6809. cur = inpL;
  6810. cur = llm_build_norm(ctx0, cur, hparams,
  6811. model.output_norm, NULL,
  6812. LLM_NORM_RMS, cb, -1);
  6813. cb(cur, "result_norm", -1);
  6814. // lm_head
  6815. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  6816. cb(cur, "result_output", -1);
  6817. ggml_build_forward_expand(gf, cur);
  6818. return gf;
  6819. }
  6820. struct ggml_cgraph * build_openelm() {
  6821. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  6822. const int64_t n_embd_head = hparams.n_embd_head_v;
  6823. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6824. struct ggml_tensor * cur;
  6825. struct ggml_tensor * inpL;
  6826. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  6827. // inp_pos - contains the positions
  6828. struct ggml_tensor * inp_pos = build_inp_pos();
  6829. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  6830. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  6831. for (int il = 0; il < n_layer; ++il) {
  6832. const int64_t n_head = hparams.n_head(il);
  6833. const int64_t n_head_kv = hparams.n_head_kv(il);
  6834. const int64_t n_head_qkv = 2*n_head_kv + n_head;
  6835. cur = inpL;
  6836. struct ggml_tensor * residual = cur;
  6837. // norm
  6838. cur = llm_build_norm(ctx0, inpL, hparams,
  6839. model.layers[il].attn_norm, NULL,
  6840. LLM_NORM_RMS, cb, il);
  6841. cb(cur, "attn_norm", il);
  6842. // self-attention
  6843. {
  6844. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  6845. cb(cur, "wqkv", il);
  6846. cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
  6847. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
  6848. cb(Qcur, "Qcur", il);
  6849. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
  6850. cb(Kcur, "Kcur", il);
  6851. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
  6852. cb(Vcur, "Vcur", il);
  6853. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  6854. model.layers[il].attn_q_norm, NULL,
  6855. LLM_NORM_RMS, cb, il);
  6856. cb(Qcur, "Qcur", il);
  6857. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  6858. model.layers[il].attn_k_norm, NULL,
  6859. LLM_NORM_RMS, cb, il);
  6860. cb(Kcur, "Kcur", il);
  6861. Qcur = ggml_rope_ext(
  6862. ctx0, Qcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig,
  6863. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  6864. );
  6865. cb(Qcur, "Qcur", il);
  6866. Kcur = ggml_rope_ext(
  6867. ctx0, Kcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig,
  6868. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  6869. );
  6870. cb(Kcur, "Kcur", il);
  6871. Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
  6872. cb(Qcur, "Vcur", il);
  6873. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  6874. model.layers[il].wo, NULL,
  6875. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  6876. }
  6877. if (il == n_layer - 1) {
  6878. // skip computing output for unused tokens
  6879. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6880. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  6881. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6882. }
  6883. struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
  6884. cb(ffn_inp, "ffn_inp", il);
  6885. // feed-forward network
  6886. {
  6887. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  6888. model.layers[il].ffn_norm, NULL,
  6889. LLM_NORM_RMS, cb, il);
  6890. cb(cur, "ffn_norm", il);
  6891. cur = llm_build_ffn(ctx0, lctx, cur,
  6892. model.layers[il].ffn_up, NULL, NULL,
  6893. model.layers[il].ffn_gate, NULL, NULL,
  6894. model.layers[il].ffn_down, NULL, NULL,
  6895. NULL,
  6896. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  6897. cb(cur, "ffn_out", il);
  6898. }
  6899. cur = ggml_add(ctx0, cur, ffn_inp);
  6900. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6901. cb(cur, "l_out", il);
  6902. inpL = cur;
  6903. }
  6904. cur = inpL;
  6905. // norm
  6906. cur = llm_build_norm(ctx0, cur, hparams,
  6907. model.output_norm, NULL,
  6908. LLM_NORM_RMS, cb, -1);
  6909. cb(cur, "result_norm", -1);
  6910. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  6911. cb(cur, "result_output", -1);
  6912. ggml_build_forward_expand(gf, cur);
  6913. return gf;
  6914. }
  6915. struct ggml_cgraph * build_gptneox() {
  6916. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  6917. const int64_t n_embd_head = hparams.n_embd_head_v;
  6918. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  6919. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  6920. struct ggml_tensor * cur;
  6921. struct ggml_tensor * inpL;
  6922. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  6923. // inp_pos - contains the positions
  6924. struct ggml_tensor * inp_pos = build_inp_pos();
  6925. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  6926. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  6927. for (int il = 0; il < n_layer; ++il) {
  6928. cur = llm_build_norm(ctx0, inpL, hparams,
  6929. model.layers[il].attn_norm,
  6930. model.layers[il].attn_norm_b,
  6931. LLM_NORM, cb, il);
  6932. cb(cur, "attn_norm", il);
  6933. // self-attention
  6934. {
  6935. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  6936. cb(cur, "wqkv", il);
  6937. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  6938. cb(cur, "bqkv", il);
  6939. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  6940. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  6941. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  6942. cb(Qcur, "Qcur", il);
  6943. cb(Kcur, "Kcur", il);
  6944. cb(Vcur, "Vcur", il);
  6945. Qcur = ggml_rope_ext(
  6946. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  6947. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6948. ext_factor, attn_factor, beta_fast, beta_slow
  6949. );
  6950. cb(Qcur, "Qcur", il);
  6951. Kcur = ggml_rope_ext(
  6952. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  6953. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  6954. ext_factor, attn_factor, beta_fast, beta_slow
  6955. );
  6956. cb(Kcur, "Kcur", il);
  6957. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  6958. model.layers[il].wo, model.layers[il].bo,
  6959. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  6960. }
  6961. if (il == n_layer - 1) {
  6962. // skip computing output for unused tokens
  6963. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  6964. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  6965. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  6966. }
  6967. // ffn
  6968. if (hparams.use_par_res) {
  6969. // attention and ffn are computed in parallel
  6970. // x = x + attn(ln1(x)) + ffn(ln2(x))
  6971. struct ggml_tensor * attn_out = cur;
  6972. cur = llm_build_norm(ctx0, inpL, hparams,
  6973. model.layers[il].ffn_norm,
  6974. model.layers[il].ffn_norm_b,
  6975. LLM_NORM, cb, il);
  6976. cb(cur, "ffn_norm", il);
  6977. cur = llm_build_ffn(ctx0, lctx, cur,
  6978. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  6979. NULL, NULL, NULL,
  6980. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  6981. NULL,
  6982. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  6983. cb(cur, "ffn_out", il);
  6984. cur = ggml_add(ctx0, cur, inpL);
  6985. cb(cur, "ffn_out", il);
  6986. cur = ggml_add(ctx0, cur, attn_out);
  6987. cur = lctx.cvec.apply_to(ctx0, cur, il);
  6988. cb(cur, "l_out", il);
  6989. // input for next layer
  6990. inpL = cur;
  6991. } else {
  6992. // attention and ffn are computed sequentially
  6993. // x = x + attn(ln1(x))
  6994. // x = x + ffn(ln2(x))
  6995. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  6996. cb(ffn_inp, "ffn_inp", il);
  6997. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  6998. model.layers[il].ffn_norm,
  6999. model.layers[il].ffn_norm_b,
  7000. LLM_NORM, cb, il);
  7001. cb(cur, "ffn_norm", il);
  7002. cur = llm_build_ffn(ctx0, lctx, cur,
  7003. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7004. NULL, NULL, NULL,
  7005. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7006. NULL,
  7007. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  7008. cb(cur, "ffn_out", il);
  7009. cur = ggml_add(ctx0, cur, ffn_inp);
  7010. cur = lctx.cvec.apply_to(ctx0, cur, il);
  7011. cb(cur, "l_out", il);
  7012. // input for next layer
  7013. inpL = cur;
  7014. }
  7015. }
  7016. cur = llm_build_norm(ctx0, inpL, hparams,
  7017. model.output_norm,
  7018. model.output_norm_b,
  7019. LLM_NORM, cb, -1);
  7020. cb(cur, "result_norm", -1);
  7021. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  7022. cb(cur, "result_output", -1);
  7023. ggml_build_forward_expand(gf, cur);
  7024. return gf;
  7025. }
  7026. struct ggml_cgraph * build_arctic() {
        // Builds the forward compute graph for the Arctic architecture.
        // Per layer: RMS norm -> self-attention (RoPE on Q and K) -> residual add,
        // then a dense FFN (up/gate/down, LLM_FFN_SILU, LLM_FFN_PAR) and a MoE FFN;
        // the two FFN results are summed to form the layer output.
        // Returns the graph after expanding it from the "result_output" tensor.
  7027. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  7028. // mutable variable, needed during the last layer of the computation to skip unused tokens
  7029. int32_t n_tokens = this->n_tokens;
  7030. const int64_t n_embd_head = hparams.n_embd_head_v;
        // this builder requires equal K and V head sizes, and n_rot equal to the head size
  7031. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7032. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7033. struct ggml_tensor * cur;
  7034. struct ggml_tensor * inpL;
  7035. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  7036. // inp_pos - contains the positions
  7037. struct ggml_tensor * inp_pos = build_inp_pos();
  7038. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  7039. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  7040. for (int il = 0; il < n_layer; ++il) {
        // keep the layer input: it feeds both the attention residual and the MoE norm below
  7041. struct ggml_tensor * inpSA = inpL;
  7042. // norm
  7043. cur = llm_build_norm(ctx0, inpL, hparams,
  7044. model.layers[il].attn_norm, NULL,
  7045. LLM_NORM_RMS, cb, il);
  7046. cb(cur, "attn_norm", il);
  7047. // self-attention
  7048. {
  7049. // compute Q and K and RoPE them
  7050. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  7051. cb(Qcur, "Qcur", il);
  7052. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  7053. cb(Kcur, "Kcur", il);
  7054. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  7055. cb(Vcur, "Vcur", il);
        // Q is reshaped to {n_embd_head, n_head, n_tokens} before RoPE; no frequency-factor tensor is passed (nullptr)
  7056. Qcur = ggml_rope_ext(
  7057. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  7058. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7059. ext_factor, attn_factor, beta_fast, beta_slow
  7060. );
  7061. cb(Qcur, "Qcur", il);
        // K uses n_head_kv heads instead of n_head
  7062. Kcur = ggml_rope_ext(
  7063. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  7064. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7065. ext_factor, attn_factor, beta_fast, beta_slow
  7066. );
  7067. cb(Kcur, "Kcur", il);
        // attention through llm_build_kv with wo and no output bias (NULL); scale is 1/sqrt(n_embd_head)
  7068. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  7069. model.layers[il].wo, NULL,
  7070. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  7071. }
  7072. if (il == n_layer - 1) {
  7073. // skip computing output for unused tokens
  7074. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  7075. n_tokens = n_outputs;
        // both the attention output and the saved layer input are reduced to the selected rows
  7076. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7077. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7078. }
        // residual: attention output + layer input
  7079. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7080. cb(ffn_inp, "ffn_inp", il);
  7081. // feed-forward network
  7082. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  7083. model.layers[il].ffn_norm, NULL,
  7084. LLM_NORM_RMS, cb, il);
  7085. cb(cur, "ffn_norm", il);
  7086. cur = llm_build_ffn(ctx0, lctx, cur,
  7087. model.layers[il].ffn_up, NULL, NULL,
  7088. model.layers[il].ffn_gate, NULL, NULL,
  7089. model.layers[il].ffn_down, NULL, NULL,
  7090. NULL,
  7091. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  7092. cb(cur, "ffn_out", il);
        // dense FFN output plus its residual input
  7093. struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
  7094. cb(ffn_out, "ffn_out", il);
  7095. // MoE
        // note: the MoE branch is normalized from inpSA (the layer input), not from ffn_inp
  7096. cur = llm_build_norm(ctx0, inpSA, hparams,
  7097. model.layers[il].ffn_norm_exps, NULL,
  7098. LLM_NORM_RMS, cb, il);
  7099. cb(cur, "ffn_norm_exps", il);
  7100. cur = llm_build_moe_ffn(ctx0, lctx, cur,
  7101. model.layers[il].ffn_gate_inp,
  7102. model.layers[il].ffn_up_exps,
  7103. model.layers[il].ffn_gate_exps,
  7104. model.layers[il].ffn_down_exps,
  7105. nullptr,
  7106. n_expert, n_expert_used,
  7107. LLM_FFN_SILU, true,
  7108. false, 0.0,
  7109. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7110. cb, il);
  7111. cb(cur, "ffn_moe_out", il);
        // combine the MoE output with the dense FFN path (which already includes the residual)
  7112. cur = ggml_add(ctx0, cur, ffn_out);
  7113. cb(cur, "ffn_out", il);
        // NOTE(review): presumably applies the per-layer control vector; cvec.apply_to is defined elsewhere
  7114. cur = lctx.cvec.apply_to(ctx0, cur, il);
  7115. cb(cur, "l_out", il);
  7116. // input for next layer
  7117. inpL = cur;
  7118. }
  7119. cur = inpL;
        // final RMS norm, without a bias tensor
  7120. cur = llm_build_norm(ctx0, cur, hparams,
  7121. model.output_norm, NULL,
  7122. LLM_NORM_RMS, cb, -1);
  7123. cb(cur, "result_norm", -1);
  7124. // lm_head
  7125. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  7126. cb(cur, "result_output", -1);
  7127. ggml_build_forward_expand(gf, cur);
  7128. return gf;
  7129. }
  7130. struct ggml_cgraph * build_deepseek() {
        // Builds the forward compute graph for the DeepSeek architecture.
        // Per layer: RMS norm -> self-attention (optional Q/K/V biases, RoPE with
        // per-layer rope factors) -> residual add -> RMS norm -> FFN.
        // The FFN is dense for layers with il < hparams.n_layer_dense_lead and
        // "MoE + shared expert" for the remaining layers.
        // Returns the graph after expanding it from the "result_output" tensor.
  7131. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  7132. // mutable variable, needed during the last layer of the computation to skip unused tokens
  7133. int32_t n_tokens = this->n_tokens;
  7134. const int64_t n_embd_head = hparams.n_embd_head_v;
        // this builder requires equal K and V head sizes, and n_rot equal to the head size
  7135. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7136. GGML_ASSERT(n_embd_head == hparams.n_rot);
  7137. struct ggml_tensor * cur;
  7138. struct ggml_tensor * inpL;
  7139. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  7140. // inp_pos - contains the positions
  7141. struct ggml_tensor * inp_pos = build_inp_pos();
  7142. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  7143. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
        // f_attention_scale == 0.0f selects the default 1/sqrt(n_embd_head) scale
  7144. const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
  7145. for (int il = 0; il < n_layer; ++il) {
        // keep the layer input for the attention residual
  7146. struct ggml_tensor * inpSA = inpL;
  7147. // norm
  7148. cur = llm_build_norm(ctx0, inpL, hparams,
  7149. model.layers[il].attn_norm, NULL,
  7150. LLM_NORM_RMS, cb, il);
  7151. cb(cur, "attn_norm", il);
  7152. // self-attention
  7153. {
  7154. // rope freq factors for llama3; may return nullptr for llama2 and other models
  7155. struct ggml_tensor * rope_factors = build_rope_factors(il);
  7156. // compute Q and K and RoPE them
  7157. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  7158. cb(Qcur, "Qcur", il);
        // the Q/K/V biases are optional: each is added only when its tensor is present
  7159. if (model.layers[il].bq) {
  7160. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7161. cb(Qcur, "Qcur", il);
  7162. }
  7163. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  7164. cb(Kcur, "Kcur", il);
  7165. if (model.layers[il].bk) {
  7166. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7167. cb(Kcur, "Kcur", il);
  7168. }
  7169. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  7170. cb(Vcur, "Vcur", il);
  7171. if (model.layers[il].bv) {
  7172. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7173. cb(Vcur, "Vcur", il);
  7174. }
        // Q is reshaped to {n_embd_head, n_head, n_tokens} and rotated using rope_factors
  7175. Qcur = ggml_rope_ext(
  7176. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
  7177. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7178. ext_factor, attn_factor, beta_fast, beta_slow
  7179. );
  7180. cb(Qcur, "Qcur", il);
        // K uses n_head_kv heads instead of n_head
  7181. Kcur = ggml_rope_ext(
  7182. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
  7183. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7184. ext_factor, attn_factor, beta_fast, beta_slow
  7185. );
  7186. cb(Kcur, "Kcur", il);
        // attention through llm_build_kv with wo/bo and the precomputed kq_scale
  7187. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  7188. model.layers[il].wo, model.layers[il].bo,
  7189. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
  7190. }
  7191. if (il == n_layer - 1) {
  7192. // skip computing output for unused tokens
  7193. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  7194. n_tokens = n_outputs;
  7195. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7196. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7197. }
        // residual: attention output + layer input
  7198. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7199. cb(ffn_inp, "ffn_inp", il);
  7200. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  7201. model.layers[il].ffn_norm, NULL,
  7202. LLM_NORM_RMS, cb, il);
  7203. cb(cur, "ffn_norm", il);
        // the first n_layer_dense_lead layers use a dense FFN; later layers use MoE + shared expert
  7204. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  7205. cur = llm_build_ffn(ctx0, lctx, cur,
  7206. model.layers[il].ffn_up, NULL, NULL,
  7207. model.layers[il].ffn_gate, NULL, NULL,
  7208. model.layers[il].ffn_down, NULL, NULL,
  7209. NULL,
  7210. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  7211. cb(cur, "ffn_out", il);
  7212. } else {
  7213. // MoE branch
        // `cur` (the normed FFN input) is left untouched here so the shared expert below sees the same input
  7214. ggml_tensor * moe_out =
  7215. llm_build_moe_ffn(ctx0, lctx, cur,
  7216. model.layers[il].ffn_gate_inp,
  7217. model.layers[il].ffn_up_exps,
  7218. model.layers[il].ffn_gate_exps,
  7219. model.layers[il].ffn_down_exps,
  7220. nullptr,
  7221. n_expert, n_expert_used,
  7222. LLM_FFN_SILU, false,
  7223. false, hparams.expert_weights_scale,
  7224. LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
  7225. cb, il);
  7226. cb(moe_out, "ffn_moe_out", il);
  7227. // FFN shared expert
  7228. {
  7229. ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
  7230. model.layers[il].ffn_up_shexp, NULL, NULL,
  7231. model.layers[il].ffn_gate_shexp, NULL, NULL,
  7232. model.layers[il].ffn_down_shexp, NULL, NULL,
  7233. NULL,
  7234. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  7235. cb(ffn_shexp, "ffn_shexp", il);
        // FFN output = routed experts + shared expert
  7236. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  7237. cb(cur, "ffn_out", il);
  7238. }
  7239. }
        // residual: FFN output + FFN input
  7240. cur = ggml_add(ctx0, cur, ffn_inp);
        // NOTE(review): presumably applies the per-layer control vector; cvec.apply_to is defined elsewhere
  7241. cur = lctx.cvec.apply_to(ctx0, cur, il);
  7242. cb(cur, "l_out", il);
  7243. // input for next layer
  7244. inpL = cur;
  7245. }
  7246. cur = inpL;
        // final RMS norm, without a bias tensor
  7247. cur = llm_build_norm(ctx0, cur, hparams,
  7248. model.output_norm, NULL,
  7249. LLM_NORM_RMS, cb, -1);
  7250. cb(cur, "result_norm", -1);
  7251. // lm_head
  7252. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  7253. cb(cur, "result_output", -1);
  7254. ggml_build_forward_expand(gf, cur);
  7255. return gf;
  7256. }
  7257. struct ggml_cgraph * build_deepseek2() {
        // Builds the forward compute graph for the DeepSeek2 architecture.
        // Attention: Q comes either from a low-rank pair (wq_a -> RMS norm -> wq_b)
        // or, for the "lite" variant, from a single wq matmul. K/V come from a
        // compressed projection (wkv_a_mqa -> RMS norm -> wkv_b). Each head is split
        // into a non-rotated ("nope") part and a rotated ("pe") part; RoPE is applied
        // to the "pe" parts only, and the single rotated key is repeated for all heads.
        // FFN: dense for layers with il < hparams.n_layer_dense_lead, otherwise
        // "MoE + shared expert".
        // NOTE(review): unlike the neighbouring builders, every matmul in this function
        // calls ggml_mul_mat directly rather than llm_build_lora_mm - confirm this is intended.
  7258. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  7259. // mutable variable, needed during the last layer of the computation to skip unused tokens
  7260. int32_t n_tokens = this->n_tokens;
        // the "lite" variant is detected purely from the layer count
  7261. bool is_lite = (hparams.n_layer == 27);
  7262. // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
  7263. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
  7264. const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
  7265. const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
  7266. const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
        // per-head split of the K head size: n_rot rotated dims, the remainder non-rotated
  7267. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  7268. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  7269. const uint32_t kv_lora_rank = hparams.n_lora_kv;
  7270. struct ggml_tensor * cur;
  7271. struct ggml_tensor * inpL;
  7272. // {n_embd, n_tokens}
  7273. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  7274. // inp_pos - contains the positions
  7275. struct ggml_tensor * inp_pos = build_inp_pos();
  7276. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  7277. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  7278. for (int il = 0; il < n_layer; ++il) {
        // keep the layer input for the attention residual
  7279. struct ggml_tensor * inpSA = inpL;
  7280. // norm
  7281. cur = llm_build_norm(ctx0, inpL, hparams,
  7282. model.layers[il].attn_norm, NULL,
  7283. LLM_NORM_RMS, cb, il);
  7284. cb(cur, "attn_norm", il);
  7285. // self_attention
  7286. {
  7287. struct ggml_tensor * q = NULL;
  7288. if (!is_lite) {
  7289. // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
  7290. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  7291. cb(q, "q", il);
  7292. q = llm_build_norm(ctx0, q, hparams,
  7293. model.layers[il].attn_q_a_norm, NULL,
  7294. LLM_NORM_RMS, cb, il);
  7295. cb(q, "q", il);
  7296. // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
  7297. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  7298. cb(q, "q", il);
  7299. } else {
        // lite variant: single full-rank Q projection
  7300. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  7301. cb(q, "q", il);
  7302. }
  7303. // split into {n_head * n_embd_head_qk_nope, n_tokens}
        // strided view: first n_embd_head_qk_nope elements of each head (offset 0)
  7304. struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  7305. ggml_row_size(q->type, hparams.n_embd_head_k),
  7306. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  7307. 0);
  7308. cb(q_nope, "q_nope", il);
  7309. // and {n_head * n_embd_head_qk_rope, n_tokens}
        // same strides, but starting after the non-rotated part of each head
  7310. struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  7311. ggml_row_size(q->type, hparams.n_embd_head_k),
  7312. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  7313. ggml_row_size(q->type, n_embd_head_qk_nope));
  7314. cb(q_pe, "q_pe", il);
  7315. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
  7316. struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
  7317. cb(kv_pe_compresseed, "kv_pe_compresseed", il);
  7318. // split into {kv_lora_rank, n_tokens}
  7319. struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
  7320. kv_pe_compresseed->nb[1],
  7321. 0);
  7322. cb(kv_compressed, "kv_compressed", il);
  7323. // and {n_embd_head_qk_rope, n_tokens}
        // viewed with a middle dimension of 1: a single rotated key per token, shared by all heads
  7324. struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
  7325. kv_pe_compresseed->nb[1],
  7326. kv_pe_compresseed->nb[1],
  7327. ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
  7328. cb(k_pe, "k_pe", il);
  7329. kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
  7330. kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
  7331. model.layers[il].attn_kv_a_norm, NULL,
  7332. LLM_NORM_RMS, cb, il);
  7333. cb(kv_compressed, "kv_compressed", il);
  7334. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  7335. struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  7336. cb(kv, "kv", il);
  7337. // split into {n_head * n_embd_head_qk_nope, n_tokens}
        // each head of `kv` holds n_embd_head_qk_nope key elements followed by n_embd_head_v value elements
  7338. struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  7339. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  7340. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  7341. 0);
  7342. cb(k_nope, "k_nope", il);
  7343. // and {n_head * n_embd_head_v, n_tokens}
  7344. struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  7345. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  7346. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  7347. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  7348. cb(v_states, "v_states", il);
        // make V contiguous, then flatten the head dimension into {n_embd_head_v * n_head, n_tokens}
  7349. v_states = ggml_cont(ctx0, v_states);
  7350. cb(v_states, "v_states", il);
  7351. v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
  7352. ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
  7353. 0);
  7354. cb(v_states, "v_states", il);
  7355. q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
        // RoPE is given attn_factor_scaled instead of attn_factor (see the pre-scaling note at the top)
  7356. q_pe = ggml_rope_ext(
  7357. ctx0, q_pe, inp_pos, nullptr,
  7358. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7359. ext_factor, attn_factor_scaled, beta_fast, beta_slow
  7360. );
  7361. cb(q_pe, "q_pe", il);
  7362. // shared RoPE key
  7363. k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
  7364. k_pe = ggml_rope_ext(
  7365. ctx0, k_pe, inp_pos, nullptr,
  7366. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7367. ext_factor, attn_factor_scaled, beta_fast, beta_slow
  7368. );
  7369. cb(k_pe, "k_pe", il);
        // rebuild full per-head Q and K by concatenating the non-rotated and rotated parts along dim 0
  7370. struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  7371. cb(q_states, "q_states", il);
        // k_pe has a single head, so it is repeated using q_pe as the target before the concat
  7372. struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  7373. cb(k_states, "k_states", il);
        // attention through llm_build_kv with wo, no output bias (NULL) and the pre-scaled kq_scale
  7374. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  7375. model.layers[il].wo, NULL,
  7376. k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
  7377. }
  7378. if (il == n_layer - 1) {
  7379. // skip computing output for unused tokens
  7380. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  7381. n_tokens = n_outputs;
  7382. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7383. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7384. }
        // residual: attention output + layer input
  7385. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7386. cb(ffn_inp, "ffn_inp", il);
  7387. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  7388. model.layers[il].ffn_norm, NULL,
  7389. LLM_NORM_RMS, cb, il);
  7390. cb(cur, "ffn_norm", il);
        // the first n_layer_dense_lead layers use a dense FFN; later layers use MoE + shared expert
  7391. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  7392. cur = llm_build_ffn(ctx0, lctx, cur,
  7393. model.layers[il].ffn_up, NULL, NULL,
  7394. model.layers[il].ffn_gate, NULL, NULL,
  7395. model.layers[il].ffn_down, NULL, NULL,
  7396. NULL,
  7397. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  7398. cb(cur, "ffn_out", il);
  7399. } else {
  7400. // MoE branch
        // here the expert bias tensor, weight normalization flag, weight scale and gating function all come from the model/hparams
  7401. ggml_tensor * moe_out =
  7402. llm_build_moe_ffn(ctx0, lctx, cur,
  7403. model.layers[il].ffn_gate_inp,
  7404. model.layers[il].ffn_up_exps,
  7405. model.layers[il].ffn_gate_exps,
  7406. model.layers[il].ffn_down_exps,
  7407. model.layers[il].ffn_exp_probs_b,
  7408. n_expert, n_expert_used,
  7409. LLM_FFN_SILU, hparams.expert_weights_norm,
  7410. true, hparams.expert_weights_scale,
  7411. (enum llama_expert_gating_func_type) hparams.expert_gating_func,
  7412. cb, il);
  7413. cb(moe_out, "ffn_moe_out", il);
  7414. // FFN shared expert
  7415. {
  7416. ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
  7417. model.layers[il].ffn_up_shexp, NULL, NULL,
  7418. model.layers[il].ffn_gate_shexp, NULL, NULL,
  7419. model.layers[il].ffn_down_shexp, NULL, NULL,
  7420. NULL,
  7421. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  7422. cb(ffn_shexp, "ffn_shexp", il);
        // FFN output = routed experts + shared expert
  7423. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  7424. cb(cur, "ffn_out", il);
  7425. }
  7426. }
        // residual: FFN output + FFN input
  7427. cur = ggml_add(ctx0, cur, ffn_inp);
        // NOTE(review): presumably applies the per-layer control vector; cvec.apply_to is defined elsewhere
  7428. cur = lctx.cvec.apply_to(ctx0, cur, il);
  7429. cb(cur, "l_out", il);
  7430. // input for next layer
  7431. inpL = cur;
  7432. }
  7433. cur = inpL;
        // final RMS norm, without a bias tensor
  7434. cur = llm_build_norm(ctx0, cur, hparams,
  7435. model.output_norm, NULL,
  7436. LLM_NORM_RMS, cb, -1);
  7437. cb(cur, "result_norm", -1);
  7438. // lm_head
  7439. cur = ggml_mul_mat(ctx0, model.output, cur);
  7440. cb(cur, "result_output", -1);
  7441. ggml_build_forward_expand(gf, cur);
  7442. return gf;
  7443. }
  7444. struct ggml_cgraph * build_bitnet() {
        // Builds the forward compute graph for the BitNet architecture.
        // Differences from the other builders visible in this file:
        //  - each projection result is multiplied by an optional per-weight scale tensor
        //    (wq_scale, wk_scale, wv_scale, wo_scale, ffn_*_scale) when that tensor is present
        //  - an extra RMS norm (attn_sub_norm / ffn_sub_norm) is applied before the attention
        //    output projection and before the FFN down projection, so both projections are
        //    built here rather than inside llm_build_kv / llm_build_ffn
        //  - the lm_head reuses model.tok_embd
        // n_tokens is not shadowed by a local copy in this function.
  7445. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  7446. const int64_t n_embd_head = hparams.n_embd_head_v;
        // this builder requires equal K and V head sizes
  7447. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7448. struct ggml_tensor * cur;
  7449. struct ggml_tensor * inpL;
  7450. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  7451. // inp_pos - contains the positions
  7452. struct ggml_tensor * inp_pos = build_inp_pos();
  7453. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  7454. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  7455. for (int il = 0; il < n_layer; ++il) {
        // keep the layer input for the attention residual
  7456. struct ggml_tensor * inpSA = inpL;
  7457. cur = llm_build_norm(ctx0, inpL, hparams,
  7458. model.layers[il].attn_norm, NULL,
  7459. LLM_NORM_RMS, cb, il);
  7460. cb(cur, "attn_norm", il);
  7461. // self-attention
  7462. {
  7463. // compute Q and K and RoPE them
  7464. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
        // optional scale is applied before the optional bias
  7465. if (model.layers[il].wq_scale) {
  7466. Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
  7467. }
  7468. cb(Qcur, "Qcur", il);
  7469. if (model.layers[il].bq) {
  7470. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  7471. cb(Qcur, "Qcur", il);
  7472. }
  7473. // B1.K
  7474. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  7475. if (model.layers[il].wk_scale) {
  7476. Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
  7477. }
  7478. cb(Kcur, "Kcur", il);
  7479. if (model.layers[il].bk) {
  7480. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  7481. cb(Kcur, "Kcur", il);
  7482. }
  7483. // B1.V
  7484. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  7485. if (model.layers[il].wv_scale) {
  7486. Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
  7487. }
  7488. cb(Vcur, "Vcur", il);
  7489. if (model.layers[il].bv) {
  7490. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  7491. cb(Vcur, "Vcur", il);
  7492. }
        // Q is reshaped to {n_embd_head, n_head, n_tokens} before RoPE; no frequency-factor tensor is passed (nullptr)
  7493. Qcur = ggml_rope_ext(
  7494. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  7495. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7496. ext_factor, attn_factor, beta_fast, beta_slow
  7497. );
  7498. cb(Qcur, "Qcur", il);
        // K uses n_head_kv heads instead of n_head
  7499. Kcur = ggml_rope_ext(
  7500. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  7501. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7502. ext_factor, attn_factor, beta_fast, beta_slow
  7503. );
  7504. cb(Kcur, "Kcur", il);
        // wo and bo are passed as NULL: the output projection is built below, after attn_sub_norm
  7505. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  7506. NULL, NULL,
  7507. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  7508. cur = llm_build_norm(ctx0, cur, hparams,
  7509. model.layers[il].attn_sub_norm, NULL,
  7510. LLM_NORM_RMS, cb, il);
  7511. cb(cur, "attn_sub_norm", il);
        // attention output projection with optional scale and bias
  7512. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
  7513. if (model.layers[il].wo_scale) {
  7514. cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
  7515. }
  7516. if (model.layers[il].bo) {
  7517. cur = ggml_add(ctx0, cur, model.layers[il].bo);
  7518. }
  7519. cb(cur, "attn_o_out", il);
  7520. }
  7521. if (il == n_layer - 1) {
  7522. // skip computing output for unused tokens
  7523. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  7524. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7525. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7526. }
        // residual: attention output + layer input
  7527. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7528. cb(ffn_inp, "ffn_inp", il);
  7529. // feed-forward network
  7530. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  7531. model.layers[il].ffn_norm, NULL,
  7532. LLM_NORM_RMS, cb, il);
  7533. cb(cur, "ffn_norm", il);
        // only up and gate (with their scale tensors) are given; the down slot is NULL because
        // the down projection is built below, after ffn_sub_norm
  7534. cur = llm_build_ffn(ctx0, lctx, cur,
  7535. model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
  7536. model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
  7537. NULL, NULL, NULL,
  7538. NULL,
  7539. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  7540. cb(cur, "ffn_sub_out", il);
  7541. cur = llm_build_norm(ctx0, cur, hparams,
  7542. model.layers[il].ffn_sub_norm, NULL,
  7543. LLM_NORM_RMS, cb, il);
  7544. cb(cur, "ffn_sub_norm", il);
        // FFN down projection with optional scale
  7545. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
  7546. if (model.layers[il].ffn_down_scale) {
  7547. cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
  7548. }
  7549. cb(cur, "ffn_down", il);
        // residual: FFN output + FFN input (no control-vector step is applied in this builder)
  7550. cur = ggml_add(ctx0, cur, ffn_inp);
  7551. cb(cur, "l_out", il);
  7552. // input for next layer
  7553. inpL = cur;
  7554. }
  7555. cur = inpL;
        // final RMS norm, without a bias tensor
  7556. cur = llm_build_norm(ctx0, cur, hparams,
  7557. model.output_norm, NULL,
  7558. LLM_NORM_RMS, cb, -1);
  7559. cb(cur, "result_norm", -1);
  7560. // lm_head
  7561. // FIXME: do not use model.tok_embd directly, duplicate as model.output
  7562. cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur);
  7563. cb(cur, "result_output", -1);
  7564. ggml_build_forward_expand(gf, cur);
  7565. return gf;
  7566. }
  7567. struct ggml_cgraph * build_t5_enc() {
        // Builds the encoder compute graph for T5.
        // Attention is built inline from the current batch's Q/K/V (llm_build_kv is not
        // called and no RoPE is applied); a position bias tensor from llm_build_pos_bias
        // is added to the attention scores before the softmax.
        // The graph ends at the encoder output norm ("result_norm"); no lm_head is built.
        // Must only be called while lctx.is_encoding is set (asserted below).
  7568. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  7569. // mutable variable, needed during the last layer of the computation to skip unused tokens
  7570. int32_t n_tokens = this->n_tokens;
  7571. const int64_t n_embd_head = hparams.n_embd_head_v;
  7572. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
        // this builder requires equal K and V head sizes
  7573. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7574. struct ggml_tensor * cur;
  7575. struct ggml_tensor * inpL;
  7576. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  7577. GGML_ASSERT(lctx.is_encoding);
        // position-bucket input, built once and reused by every layer's position bias
  7578. struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
  7579. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  7580. struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
  7581. for (int il = 0; il < n_layer; ++il) {
        // keep the layer input for the attention residual
  7582. struct ggml_tensor * inpSA = inpL;
  7583. // norm
  7584. cur = llm_build_norm(ctx0, inpL, hparams,
  7585. model.layers[il].attn_norm_enc, NULL,
  7586. LLM_NORM_RMS, cb, il);
  7587. cb(cur, "attn_norm", il);
  7588. // self-attention
  7589. {
  7590. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
  7591. cb(Qcur, "Qcur", il);
  7592. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
  7593. cb(Kcur, "Kcur", il);
  7594. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
  7595. cb(Vcur, "Vcur", il);
        // split into heads: {n_embd_head, n_head(_kv), n_tokens}, then swap dims 1 and 2
  7596. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7597. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  7598. struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  7599. struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
        // attention scores from k and q
  7600. struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  7601. cb(kq, "kq", il);
        // layers without their own attn_rel_b_enc tensor fall back to layer 0's
  7602. struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
  7603. struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
  7604. struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
  7605. cb(kq_b, "kq_b", il);
        // NOTE(review): 1.0f is presumably the softmax scale (i.e. no 1/sqrt(d) scaling) - confirm against ggml_soft_max_ext
  7606. kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
  7607. cb(kq, "kq_soft_max_ext", il);
        // V is viewed as {n_embd_gqa, n_tokens}, transposed and made contiguous
  7608. struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
  7609. cb(v, "v", il);
  7610. struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
  7611. cb(kqv, "kqv", il);
        // swap dims 1 and 2 back, then flatten to {n_embd_gqa, n_tokens}
  7612. struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  7613. cb(kqv_merged, "kqv_merged", il);
  7614. cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  7615. cb(cur, "kqv_merged_cont", il);
        // the attention result is added to the graph here, before the output projection is built
  7616. ggml_build_forward_expand(gf, cur);
  7617. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
  7618. cb(cur, "kqv_out", il);
  7619. }
  7620. if (il == n_layer - 1) {
  7621. // skip computing output for unused tokens
  7622. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  7623. n_tokens = n_outputs;
  7624. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7625. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7626. }
        // residual: attention output + layer input
  7627. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7628. cb(ffn_inp, "ffn_inp", il);
  7629. // feed-forward network
  7630. {
  7631. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  7632. model.layers[il].ffn_norm_enc, NULL,
  7633. LLM_NORM_RMS, cb, il);
  7634. cb(cur, "ffn_norm", il);
  7635. // T5 uses relu, flan-T5 uses gelu-gated
        // the variant is selected by whether the layer has an ffn_gate_enc tensor
  7636. cur = llm_build_ffn(ctx0, lctx, cur,
  7637. model.layers[il].ffn_up_enc, NULL, NULL,
  7638. model.layers[il].ffn_gate_enc, NULL, NULL,
  7639. model.layers[il].ffn_down_enc, NULL, NULL,
  7640. NULL,
  7641. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  7642. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  7643. cb, il);
  7644. cb(cur, "ffn_out", il);
  7645. }
        // residual: FFN output + FFN input
  7646. cur = ggml_add(ctx0, cur, ffn_inp);
  7647. cb(cur, "ffn_out", il);
        // add this layer's control-vector tensor when one is set (tensor_for returns nullptr otherwise)
  7648. ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
  7649. if (layer_dir != nullptr) {
  7650. cur = ggml_add(ctx0, cur, layer_dir);
  7651. }
  7652. cb(cur, "l_out", il);
  7653. // input for next layer
  7654. inpL = cur;
  7655. }
  7656. cur = inpL;
        // the un-normalized last-layer output is reported as "result_embd" before the final norm
  7657. cb(cur, "result_embd", -1);
  7658. cur = llm_build_norm(ctx0, cur, hparams,
  7659. model.output_norm_enc, NULL,
  7660. LLM_NORM_RMS, cb, -1);
  7661. cb(cur, "result_norm", -1);
  7662. ggml_build_forward_expand(gf, cur);
  7663. return gf;
  7664. }
  7665. struct ggml_cgraph * build_t5_dec() {
  7666. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  7667. // mutable variable, needed during the last layer of the computation to skip unused tokens
  7668. int32_t n_tokens = this->n_tokens;
  7669. const int64_t n_embd_head = hparams.n_embd_head_v;
  7670. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7671. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7672. struct ggml_tensor * cur;
  7673. struct ggml_tensor * inpL;
  7674. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  7675. GGML_ASSERT(!lctx.is_encoding);
  7676. GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
  7677. struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
  7678. struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
  7679. struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
  7680. struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
  7681. for (int il = 0; il < n_layer; ++il) {
  7682. struct ggml_tensor * inpSA = inpL;
  7683. // norm
  7684. cur = llm_build_norm(ctx0, inpL, hparams,
  7685. model.layers[il].attn_norm, NULL,
  7686. LLM_NORM_RMS, cb, il);
  7687. cb(cur, "attn_norm", il);
  7688. // self-attention
  7689. {
  7690. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  7691. cb(Qcur, "Qcur", il);
  7692. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  7693. cb(Kcur, "Kcur", il);
  7694. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  7695. cb(Vcur, "Vcur", il);
  7696. llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
  7697. struct ggml_tensor * k =
  7698. ggml_view_3d(ctx0, kv_self.k_l[il],
  7699. n_embd_head_k, n_kv, n_head_kv,
  7700. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  7701. ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
  7702. 0);
  7703. cb(k, "k", il);
  7704. struct ggml_tensor * v =
  7705. ggml_view_3d(ctx0, kv_self.v_l[il],
  7706. n_kv, n_embd_head_v, n_head_kv,
  7707. ggml_element_size(kv_self.v_l[il])*n_ctx,
  7708. ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
  7709. 0);
  7710. cb(v, "v", il);
  7711. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7712. struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  7713. struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  7714. cb(kq, "kq", il);
  7715. struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
  7716. struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
  7717. struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
  7718. cb(kq_b, "kq_b", il);
  7719. kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
  7720. cb(kq, "kq_soft_max_ext", il);
  7721. struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
  7722. cb(kqv, "kqv", il);
  7723. struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  7724. cb(kqv_merged, "kqv_merged", il);
  7725. cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  7726. cb(cur, "kqv_merged_cont", il);
  7727. ggml_build_forward_expand(gf, cur);
  7728. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
  7729. cb(cur, "kqv_out", il);
  7730. }
  7731. cur = ggml_add(ctx0, cur, inpSA);
  7732. cb(cur, "cross_inp", il);
  7733. struct ggml_tensor * inpCA = cur;
  7734. // norm
  7735. cur = llm_build_norm(ctx0, cur, hparams,
  7736. model.layers[il].attn_norm_cross, NULL,
  7737. LLM_NORM_RMS, cb, il);
  7738. cb(cur, "attn_norm_cross", il);
  7739. // cross-attention
  7740. {
  7741. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
  7742. cb(Qcur, "Qcur", il);
  7743. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
  7744. cb(Kcur, "Kcur", il);
  7745. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
  7746. cb(Vcur, "Vcur", il);
  7747. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7748. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
  7749. struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  7750. struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  7751. struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  7752. cb(kq, "kq", il);
  7753. kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
  7754. cb(kq, "kq_soft_max_ext", il);
  7755. struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
  7756. cb(v, "v", il);
  7757. struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
  7758. cb(kqv, "kqv", il);
  7759. struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  7760. cb(kqv_merged, "kqv_merged", il);
  7761. cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  7762. cb(cur, "kqv_merged_cont", il);
  7763. ggml_build_forward_expand(gf, cur);
  7764. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
  7765. cb(cur, "kqv_out", il);
  7766. }
  7767. if (il == n_layer - 1) {
  7768. // skip computing output for unused tokens
  7769. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  7770. n_tokens = n_outputs;
  7771. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7772. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7773. inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
  7774. }
  7775. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
  7776. cb(ffn_inp, "ffn_inp", il);
  7777. // feed-forward network
  7778. {
  7779. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  7780. model.layers[il].ffn_norm, NULL,
  7781. LLM_NORM_RMS, cb, il);
  7782. cb(cur, "ffn_norm", il);
  7783. // T5 uses relu, flan-T5 uses gelu-gated
  7784. cur = llm_build_ffn(ctx0, lctx, cur,
  7785. model.layers[il].ffn_up, NULL, NULL,
  7786. model.layers[il].ffn_gate, NULL, NULL,
  7787. model.layers[il].ffn_down, NULL, NULL,
  7788. NULL,
  7789. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  7790. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  7791. cb, il);
  7792. cb(cur, "ffn_out", il);
  7793. }
  7794. cur = ggml_add(ctx0, cur, ffn_inp);
  7795. cb(cur, "ffn_out", il);
  7796. ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
  7797. if (layer_dir != nullptr) {
  7798. cur = ggml_add(ctx0, cur, layer_dir);
  7799. }
  7800. cb(cur, "l_out", il);
  7801. // input for next layer
  7802. inpL = cur;
  7803. }
  7804. cur = inpL;
  7805. cb(cur, "result_embd", -1);
  7806. cur = llm_build_norm(ctx0, cur, hparams,
  7807. model.output_norm, NULL,
  7808. LLM_NORM_RMS, cb, -1);
  7809. cb(cur, "result_norm", -1);
  7810. // lm_head
  7811. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  7812. cb(cur, "result_output", -1);
  7813. ggml_build_forward_expand(gf, cur);
  7814. return gf;
  7815. }
  7816. struct ggml_cgraph * build_jais() {
  7817. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  7818. const int64_t n_embd_head = hparams.n_embd_head_v;
  7819. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7820. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7821. struct ggml_tensor * cur;
  7822. struct ggml_tensor * inpL;
  7823. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  7824. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  7825. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  7826. for (int il = 0; il < n_layer; ++il) {
  7827. cur = llm_build_norm(ctx0, inpL, hparams,
  7828. model.layers[il].attn_norm,
  7829. model.layers[il].attn_norm_b,
  7830. LLM_NORM, cb, il);
  7831. cb(cur, "attn_norm", il);
  7832. // self-attention
  7833. {
  7834. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  7835. cb(cur, "wqkv", il);
  7836. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  7837. cb(cur, "bqkv", il);
  7838. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
  7839. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
  7840. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
  7841. cb(Qcur, "Qcur", il);
  7842. cb(Kcur, "Kcur", il);
  7843. cb(Vcur, "Vcur", il);
  7844. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  7845. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  7846. model.layers[il].wo, model.layers[il].bo,
  7847. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il);
  7848. }
  7849. if (il == n_layer - 1) {
  7850. // skip computing output for unused tokens
  7851. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  7852. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7853. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  7854. }
  7855. // add the input
  7856. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  7857. cb(ffn_inp, "ffn_inp", il);
  7858. // FF
  7859. {
  7860. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  7861. model.layers[il].ffn_norm,
  7862. model.layers[il].ffn_norm_b,
  7863. LLM_NORM, cb, il);
  7864. cb(cur, "ffn_norm", il);
  7865. cur = llm_build_ffn(ctx0, lctx, cur,
  7866. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  7867. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  7868. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  7869. NULL,
  7870. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  7871. cb(cur, "ffn_out", il);
  7872. }
  7873. inpL = ggml_add(ctx0, cur, ffn_inp);
  7874. cb(inpL, "l_out", il);
  7875. }
  7876. cur = llm_build_norm(ctx0, inpL, hparams,
  7877. model.output_norm,
  7878. model.output_norm_b,
  7879. LLM_NORM, cb, -1);
  7880. cb(cur, "result_norm", -1);
  7881. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  7882. cb(cur, "result_output", -1);
  7883. ggml_build_forward_expand(gf, cur);
  7884. return gf;
  7885. }
  7886. struct ggml_cgraph * build_chatglm() {
  7887. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  7888. const int64_t n_embd_head = hparams.n_embd_head_v;
  7889. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  7890. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7891. struct ggml_tensor * cur;
  7892. struct ggml_tensor * inpL;
  7893. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  7894. // inp_pos - contains the positions
  7895. struct ggml_tensor * inp_pos = build_inp_pos();
  7896. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  7897. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  7898. for (int il = 0; il < n_layer; ++il) {
  7899. struct ggml_tensor * inpSA = inpL;
  7900. cur = llm_build_norm(ctx0, inpL, hparams,
  7901. model.layers[il].attn_norm,
  7902. NULL,
  7903. LLM_NORM_RMS, cb, il);
  7904. cb(cur, "attn_norm", il);
  7905. // self-attention
  7906. {
  7907. struct ggml_tensor * Qcur = nullptr;
  7908. struct ggml_tensor * Kcur = nullptr;
  7909. struct ggml_tensor * Vcur = nullptr;
  7910. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  7911. cb(cur, "wqkv", il);
  7912. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  7913. cb(cur, "bqkv", il);
  7914. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  7915. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  7916. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  7917. cb(Qcur, "Qcur", il);
  7918. cb(Kcur, "Kcur", il);
  7919. cb(Vcur, "Vcur", il);
  7920. //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
  7921. Qcur = ggml_rope_ext(
  7922. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  7923. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7924. ext_factor, attn_factor, beta_fast, beta_slow
  7925. );
  7926. cb(Qcur, "Qcur_rope", il);
  7927. Kcur = ggml_rope_ext(
  7928. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  7929. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  7930. ext_factor, attn_factor, beta_fast, beta_slow
  7931. );
  7932. cb(Kcur, "Kcur_rope", il);
  7933. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  7934. model.layers[il].wo, NULL,
  7935. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  7936. }
  7937. if (il == n_layer - 1) {
  7938. // skip computing output for unused tokens
  7939. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  7940. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  7941. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  7942. }
  7943. // Add the input
  7944. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  7945. cb(ffn_inp, "ffn_inp", il);
  7946. // FF
  7947. {
  7948. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  7949. model.layers[il].ffn_norm,
  7950. NULL,
  7951. LLM_NORM_RMS, cb, il);
  7952. cb(cur, "ffn_norm", il);
  7953. cur = llm_build_ffn(ctx0, lctx, cur,
  7954. model.layers[il].ffn_up, NULL, NULL,
  7955. NULL, NULL, NULL,
  7956. model.layers[il].ffn_down, NULL, NULL,
  7957. NULL,
  7958. LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
  7959. cb(cur, "ffn_out", il);
  7960. }
  7961. inpL = ggml_add(ctx0, cur, ffn_inp);
  7962. cb(inpL, "l_out", il);
  7963. }
  7964. cur = llm_build_norm(ctx0, inpL, hparams,
  7965. model.output_norm,
  7966. NULL,
  7967. LLM_NORM_RMS, cb, -1);
  7968. cb(cur, "result_norm", -1);
  7969. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  7970. cb(cur, "result_output", -1);
  7971. ggml_build_forward_expand(gf, cur);
  7972. return gf;
  7973. }
  7974. struct ggml_cgraph * build_nemotron() {
  7975. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  7976. const int64_t n_embd_head = hparams.n_embd_head_v;
  7977. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  7978. //GGML_ASSERT(n_embd_head == hparams.n_rot);
  7979. struct ggml_tensor * cur;
  7980. struct ggml_tensor * inpL;
  7981. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  7982. // inp_pos - contains the positions
  7983. struct ggml_tensor * inp_pos = build_inp_pos();
  7984. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  7985. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  7986. for (int il = 0; il < n_layer; ++il) {
  7987. struct ggml_tensor * inpSA = inpL;
  7988. // norm
  7989. cur = llm_build_norm(ctx0, inpL, hparams,
  7990. model.layers[il].attn_norm,
  7991. model.layers[il].attn_norm_b,
  7992. LLM_NORM, cb, il);
  7993. cb(cur, "attn_norm", il);
  7994. // self-attention
  7995. {
  7996. // compute Q and K and RoPE them
  7997. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  7998. cb(Qcur, "Qcur", il);
  7999. if (model.layers[il].bq) {
  8000. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8001. cb(Qcur, "Qcur", il);
  8002. }
  8003. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  8004. cb(Kcur, "Kcur", il);
  8005. if (model.layers[il].bk) {
  8006. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8007. cb(Kcur, "Kcur", il);
  8008. }
  8009. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  8010. cb(Vcur, "Vcur", il);
  8011. if (model.layers[il].bv) {
  8012. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8013. cb(Vcur, "Vcur", il);
  8014. }
  8015. Qcur = ggml_rope_ext(
  8016. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  8017. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8018. ext_factor, attn_factor, beta_fast, beta_slow
  8019. );
  8020. cb(Qcur, "Qcur", il);
  8021. Kcur = ggml_rope_ext(
  8022. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  8023. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8024. ext_factor, attn_factor, beta_fast, beta_slow
  8025. );
  8026. cb(Kcur, "Kcur", il);
  8027. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  8028. model.layers[il].wo, model.layers[il].bo,
  8029. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  8030. }
  8031. if (il == n_layer - 1) {
  8032. // skip computing output for unused tokens
  8033. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  8034. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8035. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8036. }
  8037. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8038. cb(ffn_inp, "ffn_inp", il);
  8039. // feed-forward network
  8040. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  8041. model.layers[il].ffn_norm,
  8042. model.layers[il].ffn_norm_b,
  8043. LLM_NORM, cb, il);
  8044. cb(cur, "ffn_norm", il);
  8045. cur = llm_build_ffn(ctx0, lctx, cur,
  8046. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  8047. NULL, NULL, NULL,
  8048. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  8049. NULL,
  8050. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
  8051. cur = ggml_add(ctx0, cur, ffn_inp);
  8052. cb(cur, "ffn_out", il);
  8053. cur = lctx.cvec.apply_to(ctx0, cur, il);
  8054. cb(cur, "l_out", il);
  8055. // input for next layer
  8056. inpL = cur;
  8057. }
  8058. cur = inpL;
  8059. cur = llm_build_norm(ctx0, cur, hparams,
  8060. model.output_norm, model.output_norm_b,
  8061. LLM_NORM, cb, -1);
  8062. cb(cur, "result_norm", -1);
  8063. // lm_head
  8064. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  8065. cb(cur, "result_output", -1);
  8066. ggml_build_forward_expand(gf, cur);
  8067. return gf;
  8068. }
  8069. struct ggml_cgraph * build_exaone() {
  8070. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8071. // mutable variable, needed during the last layer of the computation to skip unused tokens
  8072. int32_t n_tokens = this->n_tokens;
  8073. const int64_t n_embd_head = hparams.n_embd_head_v;
  8074. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8075. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8076. struct ggml_tensor * cur;
  8077. struct ggml_tensor * inpL;
  8078. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  8079. // inp_pos - contains the positions
  8080. struct ggml_tensor * inp_pos = build_inp_pos();
  8081. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  8082. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  8083. for (int il = 0; il < n_layer; ++il) {
  8084. struct ggml_tensor * inpSA = inpL;
  8085. // norm
  8086. cur = llm_build_norm(ctx0, inpL, hparams,
  8087. model.layers[il].attn_norm, NULL,
  8088. LLM_NORM_RMS, cb, il);
  8089. cb(cur, "attn_norm", il);
  8090. // self-attention
  8091. {
  8092. // rope freq factors for llama3; may return nullptr for llama2 and other models
  8093. struct ggml_tensor * rope_factors = build_rope_factors(il);
  8094. // compute Q and K and RoPE them
  8095. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  8096. cb(Qcur, "Qcur", il);
  8097. if (model.layers[il].bq) {
  8098. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8099. cb(Qcur, "Qcur", il);
  8100. }
  8101. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  8102. cb(Kcur, "Kcur", il);
  8103. if (model.layers[il].bk) {
  8104. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8105. cb(Kcur, "Kcur", il);
  8106. }
  8107. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  8108. cb(Vcur, "Vcur", il);
  8109. if (model.layers[il].bv) {
  8110. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8111. cb(Vcur, "Vcur", il);
  8112. }
  8113. Qcur = ggml_rope_ext(
  8114. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
  8115. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8116. ext_factor, attn_factor, beta_fast, beta_slow
  8117. );
  8118. cb(Qcur, "Qcur", il);
  8119. Kcur = ggml_rope_ext(
  8120. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
  8121. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8122. ext_factor, attn_factor, beta_fast, beta_slow
  8123. );
  8124. cb(Kcur, "Kcur", il);
  8125. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  8126. model.layers[il].wo, model.layers[il].bo,
  8127. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  8128. }
  8129. if (il == n_layer - 1) {
  8130. // skip computing output for unused tokens
  8131. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  8132. n_tokens = n_outputs;
  8133. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8134. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8135. }
  8136. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8137. cb(ffn_inp, "ffn_inp", il);
  8138. // feed-forward network
  8139. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  8140. model.layers[il].ffn_norm, NULL,
  8141. LLM_NORM_RMS, cb, il);
  8142. cb(cur, "ffn_norm", il);
  8143. cur = llm_build_ffn(ctx0, lctx, cur,
  8144. model.layers[il].ffn_up, NULL, NULL,
  8145. model.layers[il].ffn_gate, NULL, NULL,
  8146. model.layers[il].ffn_down, NULL, NULL,
  8147. NULL,
  8148. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  8149. cb(cur, "ffn_out", il);
  8150. cur = ggml_add(ctx0, cur, ffn_inp);
  8151. cb(cur, "ffn_out", il);
  8152. cur = lctx.cvec.apply_to(ctx0, cur, il);
  8153. cb(cur, "l_out", il);
  8154. // input for next layer
  8155. inpL = cur;
  8156. }
  8157. cur = inpL;
  8158. cur = llm_build_norm(ctx0, cur, hparams,
  8159. model.output_norm, NULL,
  8160. LLM_NORM_RMS, cb, -1);
  8161. cb(cur, "result_norm", -1);
  8162. // lm_head
  8163. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  8164. cb(cur, "result_output", -1);
  8165. ggml_build_forward_expand(gf, cur);
  8166. return gf;
  8167. }
  8168. ggml_cgraph * build_rwkv6() {
  8169. ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8170. // Token shift state dimensions should be 2 * n_emb
  8171. GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
  8172. const int64_t n_seqs = ubatch.n_seqs;
  8173. const int64_t n_seq_tokens = ubatch.n_seq_tokens;
  8174. const int64_t n_tokens = ubatch.n_tokens;
  8175. GGML_ASSERT(n_seqs != 0);
  8176. GGML_ASSERT(ubatch.equal_seqs);
  8177. GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
  8178. struct ggml_tensor * cur;
  8179. struct ggml_tensor * inpL;
  8180. struct ggml_tensor * state_copy = build_inp_s_copy();
  8181. struct ggml_tensor * state_mask = build_inp_s_mask();
  8182. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  8183. inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
  8184. for (int il = 0; il < n_layer; ++il) {
  8185. const llama_layer * layer = &model.layers[il];
  8186. // (ab)using the KV cache to store the states
  8187. struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
  8188. gf, kv_self.k_l[il], state_copy, state_mask,
  8189. hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
  8190. struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
  8191. gf, kv_self.v_l[il], state_copy, state_mask,
  8192. hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
  8193. cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  8194. token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);
  8195. struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  8196. struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  8197. struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
  8198. struct ggml_tensor * x_prev = ggml_concat(
  8199. ctx0,
  8200. att_shift,
  8201. ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
  8202. 1
  8203. );
  8204. cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
  8205. ggml_build_forward_expand(gf, cur);
  8206. ggml_build_forward_expand(
  8207. gf,
  8208. ggml_cpy(
  8209. ctx0,
  8210. wkv_states,
  8211. ggml_view_1d(
  8212. ctx0,
  8213. kv_self.v_l[il],
  8214. hparams.n_embd_v_s() * n_seqs,
  8215. hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
  8216. )
  8217. )
  8218. );
  8219. struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
  8220. x_prev = ggml_concat(
  8221. ctx0,
  8222. ffn_shift,
  8223. ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
  8224. 1
  8225. );
  8226. cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
  8227. ggml_build_forward_expand(gf, cur);
  8228. struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
  8229. struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn));
  8230. token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
  8231. ggml_build_forward_expand(
  8232. gf,
  8233. ggml_cpy(
  8234. ctx0,
  8235. ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
  8236. ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
  8237. )
  8238. );
  8239. if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
  8240. cur = ggml_scale(ctx0, cur, 0.5F);
  8241. }
  8242. cur = lctx.cvec.apply_to(ctx0, cur, il);
  8243. cb(cur, "l_out", il);
  8244. // input for next layer
  8245. inpL = cur;
  8246. }
  8247. cur = inpL;
  8248. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  8249. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  8250. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8251. cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
  8252. cb(cur, "result_norm", -1);
  8253. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  8254. cb(cur, "result_output", -1);
  8255. ggml_build_forward_expand(gf, cur);
  8256. return gf;
  8257. }
  8258. // ref: https://github.com/facebookresearch/chameleon
  8259. // based on the original build_llama() function, changes:
  8260. // * qk-norm
  8261. // * swin-norm
  8262. // * removed bias
  8263. // * removed MoE
  8264. struct ggml_cgraph * build_chameleon() {
  8265. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8266. // mutable variable, needed during the last layer of the computation to skip unused tokens
  8267. int32_t n_tokens = this->n_tokens;
  8268. const int64_t n_embd_head = hparams.n_embd_head_v;
  8269. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8270. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8271. struct ggml_tensor * cur;
  8272. struct ggml_tensor * inpL;
  8273. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  8274. // inp_pos - contains the positions
  8275. struct ggml_tensor * inp_pos = build_inp_pos();
  8276. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  8277. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  8278. for (int il = 0; il < n_layer; ++il) {
  8279. struct ggml_tensor * inpSA = inpL;
  8280. // norm
  8281. if (hparams.swin_norm) {
  8282. cur = inpL;
  8283. } else {
  8284. cur = llm_build_norm(ctx0, inpL, hparams,
  8285. model.layers[il].attn_norm, NULL,
  8286. LLM_NORM_RMS, cb, il);
  8287. cb(cur, "attn_norm", il);
  8288. }
  8289. // self-attention
  8290. {
  8291. // compute Q and K and RoPE them
  8292. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  8293. cb(Qcur, "Qcur", il);
  8294. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  8295. cb(Kcur, "Kcur", il);
  8296. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  8297. cb(Vcur, "Vcur", il);
  8298. if (model.layers[il].attn_q_norm) {
  8299. Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
  8300. ggml_element_size(Qcur) * n_embd_head,
  8301. ggml_element_size(Qcur) * n_embd_head * n_head,
  8302. 0);
  8303. cb(Qcur, "Qcur", il);
  8304. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  8305. model.layers[il].attn_q_norm,
  8306. model.layers[il].attn_q_norm_b,
  8307. LLM_NORM, cb, il);
  8308. cb(Qcur, "Qcur", il);
  8309. }
  8310. if (model.layers[il].attn_k_norm) {
  8311. Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
  8312. ggml_element_size(Kcur) * n_embd_head,
  8313. ggml_element_size(Kcur) * n_embd_head * n_head_kv,
  8314. 0);
  8315. cb(Kcur, "Kcur", il);
  8316. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  8317. model.layers[il].attn_k_norm,
  8318. model.layers[il].attn_k_norm_b,
  8319. LLM_NORM, cb, il);
  8320. cb(Kcur, "Kcur", il);
  8321. }
  8322. Qcur = ggml_rope_ext(
  8323. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  8324. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8325. ext_factor, attn_factor, beta_fast, beta_slow
  8326. );
  8327. cb(Qcur, "Qcur", il);
  8328. Kcur = ggml_rope_ext(
  8329. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  8330. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8331. ext_factor, attn_factor, beta_fast, beta_slow
  8332. );
  8333. cb(Kcur, "Kcur", il);
  8334. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  8335. model.layers[il].wo, nullptr,
  8336. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  8337. if (hparams.swin_norm) {
  8338. cur = llm_build_norm(ctx0, cur, hparams,
  8339. model.layers[il].attn_norm, NULL,
  8340. LLM_NORM_RMS, cb, il);
  8341. }
  8342. }
  8343. if (il == n_layer - 1) {
  8344. // skip computing output for unused tokens
  8345. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  8346. n_tokens = n_outputs;
  8347. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8348. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8349. }
  8350. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8351. cb(ffn_inp, "ffn_inp", il);
  8352. // feed-forward network
  8353. if (!hparams.swin_norm) {
  8354. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  8355. model.layers[il].ffn_norm, NULL,
  8356. LLM_NORM_RMS, cb, il);
  8357. cb(cur, "ffn_norm", il);
  8358. }
  8359. cur = llm_build_ffn(ctx0, lctx, cur,
  8360. model.layers[il].ffn_up, NULL, NULL,
  8361. model.layers[il].ffn_gate, NULL, NULL,
  8362. model.layers[il].ffn_down, NULL, NULL,
  8363. NULL,
  8364. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  8365. cb(cur, "ffn_out", il);
  8366. if (hparams.swin_norm) {
  8367. cur = llm_build_norm(ctx0, cur, hparams,
  8368. model.layers[il].ffn_norm, NULL,
  8369. LLM_NORM_RMS, cb, il);
  8370. cb(cur, "ffn_norm", il);
  8371. }
  8372. cur = ggml_add(ctx0, cur, ffn_inp);
  8373. cb(cur, "ffn_out", il);
  8374. cur = lctx.cvec.apply_to(ctx0, cur, il);
  8375. cb(cur, "l_out", il);
  8376. // input for next layer
  8377. inpL = cur;
  8378. }
  8379. cur = inpL;
  8380. cur = llm_build_norm(ctx0, cur, hparams,
  8381. model.output_norm, NULL,
  8382. LLM_NORM_RMS, cb, -1);
  8383. cb(cur, "result_norm", -1);
  8384. // lm_head
  8385. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  8386. cb(cur, "result_output_with_img_logits", -1);
  8387. // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
  8388. // Needs to be removed once image outputs are supported.
  8389. int img_token_end_idx = 8196;
  8390. int img_token_start_idx = 4;
  8391. int num_img_tokens = img_token_end_idx - img_token_start_idx;
  8392. // creates 1d tensor of size num_img_tokens and values -FLT_MAX,
  8393. // which ensures that text token values are always at least larger than image token values
  8394. struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
  8395. img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
  8396. cb(img_logits, "img_logits", -1);
  8397. cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
  8398. cb(cur, "result_output", -1);
  8399. ggml_build_forward_expand(gf, cur);
  8400. return gf;
  8401. }
  8402. ggml_cgraph * build_solar() {
  8403. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8404. // mutable variable, needed during the last layer of the computation to skip unused tokens
  8405. int32_t n_tokens = this->n_tokens;
  8406. const int64_t n_embd_head = hparams.n_embd_head_v;
  8407. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8408. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8409. struct ggml_tensor * cur;
  8410. struct ggml_tensor * inpL;
  8411. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  8412. // inp_pos - contains the positions
  8413. struct ggml_tensor * inp_pos = build_inp_pos();
  8414. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  8415. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  8416. struct ggml_tensor * bskcn_1;
  8417. struct ggml_tensor * bskcn_2;
  8418. for (int il = 0; il < n_layer; ++il) {
  8419. struct ggml_tensor * inpSA = inpL;
  8420. if (hparams.n_bskcn(0, il)) {
  8421. bskcn_1 = inpSA;
  8422. }
  8423. if (hparams.n_bskcn(1, il)) {
  8424. bskcn_2 = inpSA;
  8425. }
  8426. if (hparams.n_bskcn(2, il)) {
  8427. inpSA = ggml_add(
  8428. ctx0,
  8429. ggml_mul(ctx0, bskcn_1, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
  8430. ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
  8431. }
  8432. if (hparams.n_bskcn(3, il)) {
  8433. inpSA = ggml_add(
  8434. ctx0,
  8435. ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
  8436. ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
  8437. }
  8438. // norm
  8439. cur = llm_build_norm(ctx0, inpL, hparams,
  8440. model.layers[il].attn_norm, NULL,
  8441. LLM_NORM_RMS, cb, il);
  8442. cb(cur, "attn_norm", il);
  8443. // self-attention
  8444. {
  8445. // rope freq factors for llama3; may return nullptr for llama2 and other models
  8446. struct ggml_tensor * rope_factors = build_rope_factors(il);
  8447. // compute Q and K and RoPE them
  8448. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  8449. cb(Qcur, "Qcur", il);
  8450. if (model.layers[il].bq) {
  8451. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8452. cb(Qcur, "Qcur", il);
  8453. }
  8454. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  8455. cb(Kcur, "Kcur", il);
  8456. if (model.layers[il].bk) {
  8457. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8458. cb(Kcur, "Kcur", il);
  8459. }
  8460. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  8461. cb(Vcur, "Vcur", il);
  8462. if (model.layers[il].bv) {
  8463. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8464. cb(Vcur, "Vcur", il);
  8465. }
  8466. Qcur = ggml_rope_ext(
  8467. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
  8468. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8469. ext_factor, attn_factor, beta_fast, beta_slow
  8470. );
  8471. cb(Qcur, "Qcur", il);
  8472. Kcur = ggml_rope_ext(
  8473. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
  8474. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8475. ext_factor, attn_factor, beta_fast, beta_slow
  8476. );
  8477. cb(Kcur, "Kcur", il);
  8478. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  8479. model.layers[il].wo, model.layers[il].bo,
  8480. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  8481. }
  8482. if (il == n_layer - 1) {
  8483. // skip computing output for unused tokens
  8484. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  8485. n_tokens = n_outputs;
  8486. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8487. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8488. }
  8489. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8490. cb(ffn_inp, "ffn_inp", il);
  8491. // feed-forward network
  8492. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  8493. model.layers[il].ffn_norm, NULL,
  8494. LLM_NORM_RMS, cb, il);
  8495. cb(cur, "ffn_norm", il);
  8496. cur = llm_build_ffn(ctx0, lctx, cur,
  8497. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  8498. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  8499. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  8500. NULL,
  8501. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  8502. cb(cur, "ffn_out", il);
  8503. cur = ggml_add(ctx0, cur, ffn_inp);
  8504. cb(cur, "ffn_out", il);
  8505. cur = lctx.cvec.apply_to(ctx0, cur, il);
  8506. cb(cur, "l_out", il);
  8507. // input for next layer
  8508. inpL = cur;
  8509. }
  8510. cur = inpL;
  8511. cur = llm_build_norm(ctx0, cur, hparams,
  8512. model.output_norm, NULL,
  8513. LLM_NORM_RMS, cb, -1);
  8514. cb(cur, "result_norm", -1);
  8515. // lm_head
  8516. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  8517. cb(cur, "result_output", -1);
  8518. ggml_build_forward_expand(gf, cur);
  8519. return gf;
  8520. }
  8521. struct ggml_cgraph * build_wavtokenizer_dec() {
  8522. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8523. struct ggml_tensor * cur;
  8524. struct ggml_tensor * inpL;
  8525. inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
  8526. cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
  8527. cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
  8528. cur = ggml_add(ctx0, cur, model.conv1d_b);
  8529. // posnet
  8530. for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
  8531. const auto & layer = model.layers[il].posnet;
  8532. inpL = cur;
  8533. switch (il) {
  8534. case 0:
  8535. case 1:
  8536. case 3:
  8537. case 4:
  8538. {
  8539. cur = llm_build_norm(ctx0, cur, hparams,
  8540. layer.norm1,
  8541. layer.norm1_b,
  8542. LLM_NORM_GROUP, cb, 0);
  8543. cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
  8544. cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
  8545. cur = ggml_add(ctx0, cur, layer.conv1_b);
  8546. cur = llm_build_norm(ctx0, cur, hparams,
  8547. layer.norm2,
  8548. layer.norm2_b,
  8549. LLM_NORM_GROUP, cb, 0);
  8550. cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
  8551. cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
  8552. cur = ggml_add(ctx0, cur, layer.conv2_b);
  8553. cur = ggml_add(ctx0, cur, inpL);
  8554. } break;
  8555. case 2:
  8556. {
  8557. cur = llm_build_norm(ctx0, cur, hparams,
  8558. layer.attn_norm,
  8559. layer.attn_norm_b,
  8560. LLM_NORM_GROUP, cb, 0);
  8561. struct ggml_tensor * q;
  8562. struct ggml_tensor * k;
  8563. struct ggml_tensor * v;
  8564. q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
  8565. k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
  8566. v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
  8567. q = ggml_add(ctx0, q, layer.attn_q_b);
  8568. k = ggml_add(ctx0, k, layer.attn_k_b);
  8569. v = ggml_add(ctx0, v, layer.attn_v_b);
  8570. q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
  8571. k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
  8572. struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  8573. kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
  8574. cur = ggml_mul_mat(ctx0, kq, v);
  8575. cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
  8576. cur = ggml_add(ctx0, cur, layer.attn_o_b);
  8577. cur = ggml_add(ctx0, cur, inpL);
  8578. } break;
  8579. case 5:
  8580. {
  8581. cur = llm_build_norm(ctx0, cur, hparams,
  8582. layer.norm,
  8583. layer.norm_b,
  8584. LLM_NORM_GROUP, cb, 0);
  8585. } break;
  8586. default: GGML_ABORT("unknown posnet layer");
  8587. };
  8588. }
  8589. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  8590. cur = llm_build_norm(ctx0, cur, hparams,
  8591. model.tok_norm,
  8592. model.tok_norm_b,
  8593. LLM_NORM, cb, -1);
  8594. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  8595. inpL = cur;
  8596. // convnext
  8597. for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
  8598. const auto & layer = model.layers[il].convnext;
  8599. cur = inpL;
  8600. cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
  8601. cur = ggml_add(ctx0, cur, layer.dw_b);
  8602. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  8603. cur = llm_build_norm(ctx0, cur, hparams,
  8604. layer.norm,
  8605. layer.norm_b,
  8606. LLM_NORM, cb, -1);
  8607. cur = llm_build_ffn(ctx0, lctx, cur,
  8608. layer.pw1, layer.pw1_b, NULL,
  8609. NULL, NULL, NULL,
  8610. layer.pw2, layer.pw2_b, NULL,
  8611. NULL,
  8612. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  8613. cur = ggml_mul(ctx0, cur, layer.gamma);
  8614. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  8615. inpL = ggml_add(ctx0, cur, inpL);
  8616. }
  8617. cur = inpL;
  8618. cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
  8619. cur = llm_build_norm(ctx0, cur, hparams,
  8620. model.output_norm,
  8621. model.output_norm_b,
  8622. LLM_NORM, cb, -1);
  8623. // lm_head
  8624. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  8625. cur = ggml_add(ctx0, cur, model.output_b);
  8626. cb(cur, "result_embd", -1);
  8627. ggml_build_forward_expand(gf, cur);
  8628. return gf;
  8629. }
  8630. };
  8631. static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<struct llama_kv_defrag_move> & moves) {
  8632. llama_ubatch dummy = {};
  8633. dummy.equal_seqs = true;
  8634. llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
  8635. struct llm_build_context llm(lctx, dummy, cb, false);
  8636. llm.init();
  8637. struct ggml_cgraph * result = llm.build_defrag(moves);
  8638. llm.free();
  8639. return result;
  8640. }
  8641. static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
  8642. llama_ubatch dummy = {};
  8643. dummy.equal_seqs = true;
  8644. llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
  8645. struct llm_build_context llm(lctx, dummy, cb, false);
  8646. llm.init();
  8647. struct ggml_cgraph * result = llm.build_k_shift();
  8648. llm.free();
  8649. return result;
  8650. }
  8651. static struct ggml_cgraph * llama_build_graph(
  8652. llama_context & lctx,
  8653. const llama_ubatch & ubatch,
  8654. bool worst_case) {
  8655. const auto & model = lctx.model;
  8656. // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
  8657. llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
  8658. if (il >= 0) {
  8659. ggml_format_name(cur, "%s-%d", name, il);
  8660. } else {
  8661. ggml_set_name(cur, name);
  8662. }
  8663. if (!lctx.cparams.offload_kqv) {
  8664. if (strcmp(name, "kqv_merged_cont") == 0) {
  8665. // all nodes between the KV store and the attention output are run on the CPU
  8666. ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu);
  8667. }
  8668. }
  8669. // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
  8670. // FIXME: fix in ggml_backend_sched
  8671. const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
  8672. if (ubatch.n_tokens < 32 || full_offload) {
  8673. if (il != -1 && strcmp(name, "norm") == 0) {
  8674. const auto & dev_layer = lctx.model.dev_layer.at(il);
  8675. for (auto & backend : lctx.backends) {
  8676. if (ggml_backend_get_device(backend.get()) == dev_layer.dev) {
  8677. if (ggml_backend_supports_op(backend.get(), cur)) {
  8678. ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, backend.get());
  8679. }
  8680. }
  8681. }
  8682. }
  8683. }
  8684. };
  8685. struct ggml_cgraph * result = NULL;
  8686. struct llm_build_context llm(lctx, ubatch, cb, worst_case);
  8687. llm.init();
  8688. switch (model.arch) {
  8689. case LLM_ARCH_LLAMA:
  8690. case LLM_ARCH_MINICPM:
  8691. case LLM_ARCH_GRANITE:
  8692. case LLM_ARCH_GRANITE_MOE:
  8693. {
  8694. result = llm.build_llama();
  8695. } break;
  8696. case LLM_ARCH_MLLAMA:
  8697. {
  8698. result = llm.build_mllama();
  8699. } break;
  8700. case LLM_ARCH_DECI:
  8701. {
  8702. result = llm.build_deci();
  8703. } break;
  8704. case LLM_ARCH_BAICHUAN:
  8705. {
  8706. result = llm.build_baichuan();
  8707. } break;
  8708. case LLM_ARCH_FALCON:
  8709. {
  8710. result = llm.build_falcon();
  8711. } break;
  8712. case LLM_ARCH_GROK:
  8713. {
  8714. result = llm.build_grok();
  8715. } break;
  8716. case LLM_ARCH_STARCODER:
  8717. {
  8718. result = llm.build_starcoder();
  8719. } break;
  8720. case LLM_ARCH_REFACT:
  8721. {
  8722. result = llm.build_refact();
  8723. } break;
  8724. case LLM_ARCH_BERT:
  8725. case LLM_ARCH_JINA_BERT_V2:
  8726. case LLM_ARCH_NOMIC_BERT:
  8727. {
  8728. result = llm.build_bert();
  8729. } break;
  8730. case LLM_ARCH_BLOOM:
  8731. {
  8732. result = llm.build_bloom();
  8733. } break;
  8734. case LLM_ARCH_MPT:
  8735. {
  8736. result = llm.build_mpt();
  8737. } break;
  8738. case LLM_ARCH_STABLELM:
  8739. {
  8740. result = llm.build_stablelm();
  8741. } break;
  8742. case LLM_ARCH_QWEN:
  8743. {
  8744. result = llm.build_qwen();
  8745. } break;
  8746. case LLM_ARCH_QWEN2:
  8747. {
  8748. result = llm.build_qwen2();
  8749. } break;
  8750. case LLM_ARCH_QWEN2VL:
  8751. {
  8752. lctx.n_pos_per_token = 4;
  8753. result = llm.build_qwen2vl();
  8754. } break;
  8755. case LLM_ARCH_QWEN2MOE:
  8756. {
  8757. result = llm.build_qwen2moe();
  8758. } break;
  8759. case LLM_ARCH_PHI2:
  8760. {
  8761. result = llm.build_phi2();
  8762. } break;
  8763. case LLM_ARCH_PHI3:
  8764. {
  8765. result = llm.build_phi3();
  8766. } break;
  8767. case LLM_ARCH_PLAMO:
  8768. {
  8769. result = llm.build_plamo();
  8770. } break;
  8771. case LLM_ARCH_GPT2:
  8772. {
  8773. result = llm.build_gpt2();
  8774. } break;
  8775. case LLM_ARCH_CODESHELL:
  8776. {
  8777. result = llm.build_codeshell();
  8778. } break;
  8779. case LLM_ARCH_ORION:
  8780. {
  8781. result = llm.build_orion();
  8782. } break;
  8783. case LLM_ARCH_INTERNLM2:
  8784. {
  8785. result = llm.build_internlm2();
  8786. } break;
  8787. case LLM_ARCH_MINICPM3:
  8788. {
  8789. result = llm.build_minicpm3();
  8790. } break;
  8791. case LLM_ARCH_GEMMA:
  8792. {
  8793. result = llm.build_gemma();
  8794. } break;
  8795. case LLM_ARCH_GEMMA2:
  8796. {
  8797. result = llm.build_gemma2();
  8798. } break;
  8799. case LLM_ARCH_STARCODER2:
  8800. {
  8801. result = llm.build_starcoder2();
  8802. } break;
  8803. case LLM_ARCH_MAMBA:
  8804. {
  8805. result = llm.build_mamba();
  8806. } break;
  8807. case LLM_ARCH_XVERSE:
  8808. {
  8809. result = llm.build_xverse();
  8810. } break;
  8811. case LLM_ARCH_COMMAND_R:
  8812. {
  8813. result = llm.build_command_r();
  8814. } break;
  8815. case LLM_ARCH_COHERE2:
  8816. {
  8817. result = llm.build_cohere2();
  8818. } break;
  8819. case LLM_ARCH_DBRX:
  8820. {
  8821. result = llm.build_dbrx();
  8822. } break;
  8823. case LLM_ARCH_OLMO:
  8824. {
  8825. result = llm.build_olmo();
  8826. } break;
  8827. case LLM_ARCH_OLMO2:
  8828. {
  8829. result = llm.build_olmo2();
  8830. } break;
  8831. case LLM_ARCH_OLMOE:
  8832. {
  8833. result = llm.build_olmoe();
  8834. } break;
  8835. case LLM_ARCH_OPENELM:
  8836. {
  8837. result = llm.build_openelm();
  8838. } break;
  8839. case LLM_ARCH_GPTNEOX:
  8840. {
  8841. result = llm.build_gptneox();
  8842. } break;
  8843. case LLM_ARCH_ARCTIC:
  8844. {
  8845. result = llm.build_arctic();
  8846. } break;
  8847. case LLM_ARCH_DEEPSEEK:
  8848. {
  8849. result = llm.build_deepseek();
  8850. } break;
  8851. case LLM_ARCH_DEEPSEEK2:
  8852. {
  8853. result = llm.build_deepseek2();
  8854. } break;
  8855. case LLM_ARCH_CHATGLM:
  8856. {
  8857. result = llm.build_chatglm();
  8858. } break;
  8859. case LLM_ARCH_BITNET:
  8860. {
  8861. result = llm.build_bitnet();
  8862. } break;
  8863. case LLM_ARCH_T5:
  8864. {
  8865. if (lctx.is_encoding) {
  8866. result = llm.build_t5_enc();
  8867. } else {
  8868. result = llm.build_t5_dec();
  8869. }
  8870. } break;
  8871. case LLM_ARCH_T5ENCODER:
  8872. {
  8873. result = llm.build_t5_enc();
  8874. } break;
  8875. case LLM_ARCH_JAIS:
  8876. {
  8877. result = llm.build_jais();
  8878. } break;
  8879. case LLM_ARCH_NEMOTRON:
  8880. {
  8881. result = llm.build_nemotron();
  8882. } break;
  8883. case LLM_ARCH_EXAONE:
  8884. {
  8885. result = llm.build_exaone();
  8886. } break;
  8887. case LLM_ARCH_RWKV6:
  8888. {
  8889. result = llm.build_rwkv6();
  8890. } break;
  8891. case LLM_ARCH_CHAMELEON:
  8892. {
  8893. result = llm.build_chameleon();
  8894. } break;
  8895. case LLM_ARCH_SOLAR:
  8896. {
  8897. result = llm.build_solar();
  8898. } break;
  8899. case LLM_ARCH_WAVTOKENIZER_DEC:
  8900. {
  8901. result = llm.build_wavtokenizer_dec();
  8902. } break;
  8903. default:
  8904. GGML_ABORT("fatal error");
  8905. }
  8906. // add on pooling layer
  8907. if (lctx.cparams.embeddings) {
  8908. result = llm.append_pooling(result);
  8909. }
  8910. llm.free();
  8911. return result;
  8912. }
  8913. // returns the result of ggml_backend_sched_graph_compute_async execution
  8914. static enum ggml_status llama_graph_compute(
  8915. llama_context & lctx,
  8916. ggml_cgraph * gf,
  8917. int n_threads,
  8918. ggml_threadpool * threadpool) {
  8919. if (lctx.backend_cpu != nullptr) {
  8920. auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
  8921. auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
  8922. set_threadpool_fn(lctx.backend_cpu, threadpool);
  8923. }
  8924. // set the number of threads for all the backends
  8925. for (const auto & set_n_threads_fn : lctx.set_n_threads_fns) {
  8926. set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
  8927. }
  8928. auto status = ggml_backend_sched_graph_compute_async(lctx.sched.get(), gf);
  8929. if (status != GGML_STATUS_SUCCESS) {
  8930. LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
  8931. }
  8932. // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
  8933. return status;
  8934. }
  8935. // decode a batch of tokens by evaluating the transformer
  8936. // in case of unsuccessful decoding (error or warning),
  8937. // the kv_cache state will be returned to its original state
  8938. // (for non-recurrent models) or cleaned (for recurrent models)
  8939. //
  8940. // - lctx: llama context
  8941. // - batch: batch to evaluate
  8942. //
  8943. // return 0 on success
  8944. // return positive int on warning
  8945. // return negative int on error
  8946. //
  8947. static int llama_decode_internal(
  8948. llama_context & lctx,
  8949. llama_batch inp_batch) {
  8950. lctx.is_encoding = false;
  8951. if (inp_batch.n_tokens == 0) {
  8952. LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
  8953. return -1;
  8954. }
  8955. // temporary allocate memory for the input batch if needed
  8956. llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
  8957. const llama_batch & batch = batch_allocr.batch;
  8958. const uint32_t n_tokens_all = batch.n_tokens;
  8959. const auto & model = lctx.model;
  8960. const auto & hparams = model.hparams;
  8961. const auto & cparams = lctx.cparams;
  8962. GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  8963. if (batch.token) {
  8964. for (uint32_t i = 0; i < n_tokens_all; ++i) {
  8965. if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
  8966. LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
  8967. return -1;
  8968. }
  8969. }
  8970. }
  8971. GGML_ASSERT(n_tokens_all <= cparams.n_batch);
  8972. GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
  8973. if (lctx.t_compute_start_us == 0) {
  8974. lctx.t_compute_start_us = ggml_time_us();
  8975. }
  8976. lctx.n_queued_tokens += n_tokens_all;
  8977. auto & kv_self = lctx.kv_self;
  8978. llama_kv_slot_restorer kv_slot_restorer(kv_self);
  8979. const int64_t n_embd = hparams.n_embd;
  8980. const int64_t n_vocab = hparams.n_vocab;
  8981. uint32_t n_outputs = 0;
  8982. uint32_t n_outputs_prev = 0;
  8983. const auto n_ubatch = cparams.n_ubatch;
  8984. // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
  8985. const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
  8986. lctx.embd_seq.clear();
  8987. // count outputs
  8988. if (batch.logits && !embd_pooled) {
  8989. for (uint32_t i = 0; i < n_tokens_all; ++i) {
  8990. n_outputs += batch.logits[i] != 0;
  8991. }
  8992. } else if (lctx.logits_all || embd_pooled) {
  8993. n_outputs = n_tokens_all;
  8994. } else {
  8995. // keep last output only
  8996. n_outputs = 1;
  8997. }
  8998. lctx.sbatch.from_batch(batch, batch.n_embd,
  8999. /* simple_split */ !kv_self.recurrent,
  9000. /* logits_all */ n_outputs == n_tokens_all);
  9001. // reserve output buffer
  9002. if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
  9003. LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
  9004. return -2;
  9005. };
  9006. while (lctx.sbatch.n_tokens > 0) {
  9007. llama_ubatch ubatch;
  9008. if (kv_self.recurrent) {
  9009. if (embd_pooled) {
  9010. // Pooled embeddings cannot be split across ubatches (yet)
  9011. ubatch = lctx.sbatch.split_seq(n_ubatch);
  9012. } else {
  9013. // recurrent model architectures are easier to implement
  9014. // with equal-length sequences
  9015. ubatch = lctx.sbatch.split_equal(n_ubatch);
  9016. }
  9017. } else {
  9018. ubatch = lctx.sbatch.split_simple(n_ubatch);
  9019. }
  9020. const uint32_t n_tokens = ubatch.n_tokens;
  9021. // count the outputs in this u_batch
  9022. {
  9023. int32_t n_outputs_new = 0;
  9024. if (n_outputs == n_tokens_all) {
  9025. n_outputs_new = n_tokens;
  9026. } else {
  9027. GGML_ASSERT(ubatch.output);
  9028. for (uint32_t i = 0; i < n_tokens; i++) {
  9029. n_outputs_new += (int32_t) (ubatch.output[i] != 0);
  9030. }
  9031. }
  9032. // needs to happen before the graph is built
  9033. lctx.n_outputs = n_outputs_new;
  9034. }
  9035. int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
  9036. ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
  9037. GGML_ASSERT(n_threads > 0);
  9038. // non-causal masks do not use the KV cache
  9039. if (hparams.causal_attn) {
  9040. llama_kv_cache_update(&lctx);
  9041. // if we have enough unused cells before the current head ->
  9042. // better to start searching from the beginning of the cache, hoping to fill it
  9043. if (kv_self.head > kv_self.used + 2*n_tokens) {
  9044. kv_self.head = 0;
  9045. }
  9046. auto slot = llama_kv_cache_find_slot(kv_self, ubatch);
  9047. if (!slot) {
  9048. llama_kv_cache_defrag(kv_self);
  9049. llama_kv_cache_update(&lctx);
  9050. slot = llama_kv_cache_find_slot(kv_self, ubatch);
  9051. }
  9052. if (!slot) {
  9053. return 1;
  9054. }
  9055. kv_slot_restorer.save(slot);
  9056. if (!kv_self.recurrent) {
  9057. // a heuristic, to avoid attending the full cache if it is not yet utilized
  9058. // after enough generations, the benefit from this heuristic disappears
  9059. // if we start defragmenting the cache, the benefit from this will be more important
  9060. const uint32_t pad = llama_kv_cache_get_padding(cparams);
  9061. kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
  9062. //kv_self.n = llama_kv_cache_cell_max(kv_self);
  9063. }
  9064. }
  9065. //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
  9066. ggml_backend_sched_reset(lctx.sched.get());
  9067. ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
  9068. ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
  9069. // the output is always the last tensor in the graph
  9070. struct ggml_tensor * res = ggml_graph_node(gf, -1);
  9071. struct ggml_tensor * embd = ggml_graph_node(gf, -2);
  9072. if (lctx.n_outputs == 0) {
  9073. // no output
  9074. res = nullptr;
  9075. embd = nullptr;
  9076. } else if (cparams.embeddings) {
  9077. embd = nullptr;
  9078. for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
  9079. if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
  9080. embd = ggml_graph_node(gf, i);
  9081. break;
  9082. }
  9083. }
  9084. } else {
  9085. embd = nullptr; // do not extract embeddings when not needed
  9086. GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
  9087. }
  9088. if (!cparams.causal_attn) {
  9089. res = nullptr; // do not extract logits when not needed
  9090. }
  9091. // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
  9092. ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
  9093. llama_set_inputs(lctx, ubatch);
  9094. const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
  9095. if (compute_status != GGML_STATUS_SUCCESS) {
  9096. kv_slot_restorer.restore(kv_self);
  9097. switch (compute_status) {
  9098. case GGML_STATUS_ABORTED:
  9099. return 2;
  9100. case GGML_STATUS_ALLOC_FAILED:
  9101. return -2;
  9102. case GGML_STATUS_FAILED:
  9103. default:
  9104. return -3;
  9105. }
  9106. }
  9107. // update the kv ring buffer
  9108. {
  9109. kv_self.head += n_tokens;
  9110. // Ensure kv cache head points to a valid index.
  9111. if (kv_self.head >= kv_self.size) {
  9112. kv_self.head = 0;
  9113. }
  9114. }
  9115. // plot the computation graph in dot format (for debugging purposes)
  9116. //if (n_past%100 == 0) {
  9117. // ggml_graph_dump_dot(gf, NULL, "llama.dot");
  9118. //}
  9119. // extract logits
  9120. if (res) {
  9121. ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), res);
  9122. GGML_ASSERT(backend_res != nullptr);
  9123. GGML_ASSERT(lctx.logits != nullptr);
  9124. float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
  9125. const int32_t n_outputs_new = lctx.n_outputs;
  9126. if (n_outputs_new) {
  9127. GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
  9128. GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
  9129. ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
  9130. }
  9131. }
  9132. // extract embeddings
  9133. if (embd) {
  9134. ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd);
  9135. GGML_ASSERT(backend_embd != nullptr);
  9136. switch (cparams.pooling_type) {
  9137. case LLAMA_POOLING_TYPE_NONE:
  9138. {
  9139. // extract token embeddings
  9140. GGML_ASSERT(lctx.embd != nullptr);
  9141. float * embd_out = lctx.embd + n_outputs_prev*n_embd;
  9142. const int32_t n_outputs_new = lctx.n_outputs;
  9143. if (n_outputs_new) {
  9144. GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
  9145. GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
  9146. ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
  9147. }
  9148. } break;
  9149. case LLAMA_POOLING_TYPE_MEAN:
  9150. case LLAMA_POOLING_TYPE_CLS:
  9151. case LLAMA_POOLING_TYPE_LAST:
  9152. {
  9153. // extract sequence embeddings (cleared before processing each batch)
  9154. auto & embd_seq_out = lctx.embd_seq;
  9155. for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
  9156. const llama_seq_id seq_id = ubatch.seq_id[s][0];
  9157. if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
  9158. continue;
  9159. }
  9160. embd_seq_out[seq_id].resize(n_embd);
  9161. ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
  9162. }
  9163. } break;
  9164. case LLAMA_POOLING_TYPE_RANK:
  9165. {
  9166. // extract the rerank score - a single float per sequence
  9167. auto & embd_seq_out = lctx.embd_seq;
  9168. for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
  9169. const llama_seq_id seq_id = ubatch.seq_id[s][0];
  9170. if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
  9171. continue;
  9172. }
  9173. embd_seq_out[seq_id].resize(1);
  9174. ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
  9175. }
  9176. } break;
  9177. case LLAMA_POOLING_TYPE_UNSPECIFIED:
  9178. {
  9179. GGML_ABORT("unknown pooling type");
  9180. }
  9181. }
  9182. }
  9183. n_outputs_prev += lctx.n_outputs;
  9184. }
  9185. // set output mappings
  9186. {
  9187. bool sorted_output = true;
  9188. GGML_ASSERT(lctx.sbatch.out_ids.size() == n_outputs);
  9189. for (size_t i = 0; i < n_outputs; ++i) {
  9190. size_t out_id = lctx.sbatch.out_ids[i];
  9191. lctx.output_ids[out_id] = i;
  9192. if (out_id != i) {
  9193. sorted_output = false;
  9194. }
  9195. }
  9196. if (sorted_output) {
  9197. lctx.sbatch.out_ids.clear();
  9198. }
  9199. }
  9200. // set to total number of outputs in the batch, for use in llama_get_logits_ith
  9201. lctx.n_outputs = n_outputs;
  9202. // wait for the computation to finish (automatically done when obtaining the model output)
  9203. //llama_synchronize(&lctx);
  9204. // decide if we need to defrag the kv cache
  9205. if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
  9206. const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
  9207. // queue defragmentation for next llama_kv_cache_update
  9208. if (fragmentation > cparams.defrag_thold) {
  9209. //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
  9210. llama_kv_cache_defrag(kv_self);
  9211. }
  9212. }
  9213. // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
  9214. // overlap with device computation.
  9215. ggml_backend_sched_reset(lctx.sched.get());
  9216. return 0;
  9217. }
  9218. // encode a batch of tokens by evaluating the encoder part of the transformer
  9219. //
  9220. // - lctx: llama context
  9221. // - batch: batch to evaluate
  9222. //
  9223. // return 0 on success
  9224. // return positive int on warning
  9225. // return negative int on error
  9226. //
  9227. static int llama_encode_internal(
  9228. llama_context & lctx,
  9229. llama_batch inp_batch) {
  9230. lctx.is_encoding = true;
  9231. if (inp_batch.n_tokens == 0) {
  9232. LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
  9233. return -1;
  9234. }
  9235. // temporary allocate memory for the input batch if needed
  9236. llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : lctx.kv_self.max_pos() + 1);
  9237. const llama_batch & batch = batch_allocr.batch;
  9238. const uint32_t n_tokens = batch.n_tokens;
  9239. const auto & model = lctx.model;
  9240. const auto & hparams = model.hparams;
  9241. const auto & cparams = lctx.cparams;
  9242. GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  9243. if (batch.token) {
  9244. for (uint32_t i = 0; i < n_tokens; ++i) {
  9245. if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) {
  9246. LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
  9247. return -1;
  9248. }
  9249. }
  9250. }
  9251. // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
  9252. GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
  9253. if (lctx.t_compute_start_us == 0) {
  9254. lctx.t_compute_start_us = ggml_time_us();
  9255. }
  9256. lctx.n_queued_tokens += n_tokens;
  9257. const int64_t n_embd = hparams.n_embd;
  9258. lctx.sbatch.from_batch(batch, batch.n_embd, /* simple_split */ true, /* logits_all */ true);
  9259. const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
  9260. // reserve output buffer
  9261. if (llama_output_reserve(lctx, n_tokens) < n_tokens) {
  9262. LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
  9263. return -2;
  9264. };
  9265. for (uint32_t i = 0; i < n_tokens; ++i) {
  9266. lctx.output_ids[i] = i;
  9267. }
  9268. lctx.inp_embd_enc = NULL;
  9269. lctx.n_outputs = n_tokens;
  9270. int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
  9271. ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
  9272. GGML_ASSERT(n_threads > 0);
  9273. ggml_backend_sched_reset(lctx.sched.get());
  9274. ggml_backend_sched_set_eval_callback(lctx.sched.get(), lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
  9275. ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
  9276. // the output embeddings after the final encoder normalization
  9277. struct ggml_tensor * embd = nullptr;
  9278. // there are two cases here
  9279. if (llama_model_has_decoder(&lctx.model)) {
  9280. // first case is an encoder-decoder T5 model where embeddings are passed to decoder
  9281. embd = ggml_graph_node(gf, -1);
  9282. GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
  9283. } else {
  9284. // second case is an encoder-only T5 model
  9285. if (cparams.embeddings) {
  9286. // only output embeddings if required
  9287. embd = ggml_graph_node(gf, -1);
  9288. if (strcmp(embd->name, "result_embd_pooled") != 0) {
  9289. embd = ggml_graph_node(gf, -2);
  9290. }
  9291. GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
  9292. }
  9293. }
  9294. ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
  9295. llama_set_inputs(lctx, ubatch);
  9296. const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
  9297. switch (compute_status) {
  9298. case GGML_STATUS_SUCCESS:
  9299. break;
  9300. case GGML_STATUS_ABORTED:
  9301. return 2;
  9302. case GGML_STATUS_ALLOC_FAILED:
  9303. return -2;
  9304. case GGML_STATUS_FAILED:
  9305. default:
  9306. return -3;
  9307. }
  9308. // extract embeddings
  9309. if (embd) {
  9310. ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched.get(), embd);
  9311. GGML_ASSERT(backend_embd != nullptr);
  9312. if (llama_model_has_decoder(&lctx.model)) {
  9313. lctx.embd_enc.resize(n_tokens*n_embd);
  9314. float * embd_out = lctx.embd_enc.data();
  9315. ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
  9316. GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits
  9317. // remember the sequence ids used during the encoding - needed for cross attention later
  9318. lctx.seq_ids_enc.resize(n_tokens);
  9319. for (uint32_t i = 0; i < n_tokens; i++) {
  9320. for (int s = 0; s < ubatch.n_seq_id[i]; s++) {
  9321. llama_seq_id seq_id = ubatch.seq_id[i][s];
  9322. lctx.seq_ids_enc[i].insert(seq_id);
  9323. }
  9324. }
  9325. } else {
  9326. GGML_ASSERT(lctx.embd != nullptr);
  9327. switch (cparams.pooling_type) {
  9328. case LLAMA_POOLING_TYPE_NONE:
  9329. {
  9330. // extract token embeddings
  9331. GGML_ASSERT(lctx.embd != nullptr);
  9332. float * embd_out = lctx.embd;
  9333. GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
  9334. ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
  9335. } break;
  9336. case LLAMA_POOLING_TYPE_MEAN:
  9337. case LLAMA_POOLING_TYPE_CLS:
  9338. case LLAMA_POOLING_TYPE_LAST:
  9339. {
  9340. // extract sequence embeddings
  9341. auto & embd_seq_out = lctx.embd_seq;
  9342. embd_seq_out.clear();
  9343. GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits
  9344. for (uint32_t i = 0; i < n_tokens; i++) {
  9345. const llama_seq_id seq_id = ubatch.seq_id[i][0];
  9346. if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
  9347. continue;
  9348. }
  9349. embd_seq_out[seq_id].resize(n_embd);
  9350. ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
  9351. }
  9352. } break;
  9353. case LLAMA_POOLING_TYPE_RANK:
  9354. {
  9355. // TODO: this likely should be the same logic as in llama_decoder_internal, but better to
  9356. // wait for an encoder model that requires this pooling type in order to test it
  9357. // https://github.com/ggerganov/llama.cpp/pull/9510
  9358. GGML_ABORT("RANK pooling not implemented yet");
  9359. }
  9360. case LLAMA_POOLING_TYPE_UNSPECIFIED:
  9361. {
  9362. GGML_ABORT("unknown pooling type");
  9363. }
  9364. }
  9365. }
  9366. }
  9367. // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
  9368. // overlap with device computation.
  9369. ggml_backend_sched_reset(lctx.sched.get());
  9370. return 0;
  9371. }
  9372. // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
  9373. static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
  9374. auto & kv_self = lctx.kv_self;
  9375. const auto & hparams = lctx.model.hparams;
  9376. const uint32_t n_layer = hparams.n_layer;
  9377. const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
  9378. const uint32_t n_used = kv_self.used;
  9379. assert(n_used <= n_kv);
  9380. //const int64_t t_start = ggml_time_us();
  9381. // groups of cells moved
  9382. std::vector<struct llama_kv_defrag_move> moves;
  9383. // each move requires 6*n_layer tensors (see build_defrag)
  9384. // - source view, destination view, copy operation
  9385. // - x2 for keys and values
  9386. //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer);
  9387. // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
  9388. const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
  9389. // determine which KV cells to move where
  9390. //
  9391. // cell i moves to ids[i]
  9392. //
  9393. // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
  9394. //
  9395. std::vector<uint32_t> ids(n_kv, n_kv);
  9396. for (uint32_t i0 = 0; i0 < n_used; ++i0) {
  9397. const auto & cell0 = kv_self.cells[i0];
  9398. if (!cell0.is_empty()) {
  9399. ids[i0] = i0;
  9400. continue;
  9401. }
  9402. // found a hole - fill it with data from the end of the cache
  9403. uint32_t nh = 1;
  9404. // determine the size of the hole
  9405. while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
  9406. nh++;
  9407. }
  9408. uint32_t nf = 0;
  9409. uint32_t is = n_kv - 1;
  9410. // starting from the end, find nh non-empty cells
  9411. for (; is > i0; --is) {
  9412. const auto & cell1 = kv_self.cells[is];
  9413. if (cell1.is_empty() || ids[is] != n_kv) {
  9414. continue;
  9415. }
  9416. // non-empty cell which is not yet moved
  9417. nf++;
  9418. if (nf == nh) {
  9419. break;
  9420. }
  9421. }
  9422. // this can only happen if `n_used` is not accurate, which would be a bug
  9423. GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
  9424. nf = 0;
  9425. uint32_t i1 = is;
  9426. // are we moving a continuous block of memory?
  9427. bool cont = false;
  9428. // go back and move the nf cells to the hole
  9429. for (; i1 < n_kv; ++i1) {
  9430. auto & cell1 = kv_self.cells[i1];
  9431. if (cell1.is_empty() || ids[i1] != n_kv) {
  9432. cont = false;
  9433. continue;
  9434. }
  9435. // this cell goes to (i0 + nf)
  9436. ids[i1] = i0 + nf;
  9437. // move the cell meta data
  9438. kv_self.cells[i0 + nf] = cell1;
  9439. // clear the old cell and move the head there
  9440. cell1 = llama_kv_cell();
  9441. kv_self.head = n_used;
  9442. if (!cont) {
  9443. moves.push_back({i1, i0 + nf, 1});
  9444. cont = true;
  9445. } else {
  9446. moves.back().len++;
  9447. }
  9448. nf++;
  9449. if (nf == nh) {
  9450. break;
  9451. }
  9452. }
  9453. //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
  9454. i0 += nh - 1;
  9455. }
  9456. if (moves.size() == 0) {
  9457. return;
  9458. }
  9459. //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", moves.size());
  9460. #if 0
  9461. // CPU defrag
  9462. //
  9463. // TODO: optimizations are possible:
  9464. // - multiple threads
  9465. // - avoid copying to the host memory when already there
  9466. //
  9467. // likely not worth the effort, as we have ggml_graph based defrag
  9468. //
  9469. const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  9470. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  9471. const uint32_t kv_size = kv_self.size;
  9472. std::vector<uint8_t> buf_k;
  9473. std::vector<uint8_t> buf_v;
  9474. for (uint32_t il = 0; il < n_layer; ++il) {
  9475. const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
  9476. const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
  9477. const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
  9478. const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
  9479. buf_k.resize(k_size);
  9480. buf_v.resize(v_size);
  9481. ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
  9482. ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
  9483. // batch move [i, i+nm) to [id, id+nm)
  9484. // note: cells can move only to a lower index
  9485. for (uint32_t i = 0; i < n_kv; ++i) {
  9486. const uint32_t id = ids[i];
  9487. if (i == id || id == n_kv) {
  9488. continue;
  9489. }
  9490. uint32_t nm = 1;
  9491. while (i + nm < n_kv && ids[i + nm] == id + nm) {
  9492. nm++;
  9493. }
  9494. // move keys
  9495. {
  9496. const int64_t os = i*k_size_row;
  9497. const int64_t od = id*k_size_row;
  9498. memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
  9499. }
  9500. // move values (note: they are transposed)
  9501. {
  9502. const int64_t os = i;
  9503. const int64_t od = id;
  9504. for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
  9505. memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
  9506. }
  9507. }
  9508. i += nm - 1;
  9509. }
  9510. ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
  9511. ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
  9512. }
  9513. #else
  9514. // ggml_graph defrag
  9515. for (std::size_t i = 0; i < moves.size(); i += max_moves) {
  9516. std::vector<struct llama_kv_defrag_move> chunk;
  9517. auto end = std::min(i + max_moves, moves.size());
  9518. chunk.assign(moves.begin() + i, moves.begin() + end);
  9519. ggml_backend_sched_reset(lctx.sched.get());
  9520. //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*chunk.size()*n_layer);
  9521. ggml_cgraph * gf = llama_build_graph_defrag(lctx, chunk);
  9522. llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
  9523. }
  9524. #endif
  9525. //const int64_t t_end = ggml_time_us();
  9526. //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
  9527. }
  9528. static void llama_kv_cache_update_internal(struct llama_context & lctx) {
  9529. bool need_reserve = false;
  9530. if (lctx.kv_self.has_shift) {
  9531. if (!llama_kv_cache_can_shift(&lctx)) {
  9532. GGML_ABORT("The current context does not support K-shift");
  9533. }
  9534. // apply K-shift if needed
  9535. if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
  9536. ggml_backend_sched_reset(lctx.sched.get());
  9537. ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
  9538. ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
  9539. llama_set_k_shift(lctx);
  9540. llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
  9541. need_reserve = true;
  9542. }
  9543. {
  9544. auto & kv_self = lctx.kv_self;
  9545. kv_self.has_shift = false;
  9546. for (uint32_t i = 0; i < kv_self.size; ++i) {
  9547. kv_self.cells[i].delta = 0;
  9548. }
  9549. }
  9550. }
  9551. // defragment the KV cache if needed
  9552. if (lctx.kv_self.do_defrag) {
  9553. llama_kv_cache_defrag_internal(lctx);
  9554. need_reserve = true;
  9555. lctx.kv_self.do_defrag = false;
  9556. }
  9557. // reserve a worst case graph again
  9558. if (need_reserve) {
  9559. // TODO: extract to a function
  9560. // build worst-case graph
  9561. uint32_t n_seqs = 1; // TODO: worst-case number of sequences
  9562. uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
  9563. llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
  9564. llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  9565. ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);
  9566. // initialize scheduler with the worst-case graph
  9567. ggml_backend_sched_reset(lctx.sched.get());
  9568. if (!ggml_backend_sched_reserve(lctx.sched.get(), gf)) {
  9569. LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
  9570. }
  9571. }
  9572. }
  9573. int32_t llama_lora_adapter_set(
  9574. struct llama_context * ctx,
  9575. struct llama_lora_adapter * adapter,
  9576. float scale) {
  9577. if (ctx->cparams.flash_attn) {
  9578. LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
  9579. return -1;
  9580. }
  9581. ctx->lora_adapters[adapter] = scale;
  9582. return 0;
  9583. }
  9584. int32_t llama_lora_adapter_remove(
  9585. struct llama_context * ctx,
  9586. struct llama_lora_adapter * adapter) {
  9587. auto pos = ctx->lora_adapters.find(adapter);
  9588. if (pos != ctx->lora_adapters.end()) {
  9589. ctx->lora_adapters.erase(pos);
  9590. return 0;
  9591. }
  9592. return -1;
  9593. }
  9594. void llama_lora_adapter_clear(struct llama_context * ctx) {
  9595. ctx->lora_adapters.clear();
  9596. }
  9597. // TODO: tmp
  9598. int32_t llama_control_vector_apply(
  9599. struct llama_context * lctx,
  9600. const float * data,
  9601. size_t len,
  9602. int32_t n_embd,
  9603. int32_t il_start,
  9604. int32_t il_end) {
  9605. return llama_control_vector_apply(lctx->cvec, lctx->model, data, len, n_embd, il_start, il_end);
  9606. }
  9607. //
  9608. // interface implementation
  9609. //
  9610. struct llama_context_params llama_context_default_params() {
  9611. struct llama_context_params result = {
  9612. /*.n_ctx =*/ 512,
  9613. /*.n_batch =*/ 2048,
  9614. /*.n_ubatch =*/ 512,
  9615. /*.n_seq_max =*/ 1,
  9616. /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
  9617. /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
  9618. /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
  9619. /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
  9620. /*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
  9621. /*.rope_freq_base =*/ 0.0f,
  9622. /*.rope_freq_scale =*/ 0.0f,
  9623. /*.yarn_ext_factor =*/ -1.0f,
  9624. /*.yarn_attn_factor =*/ 1.0f,
  9625. /*.yarn_beta_fast =*/ 32.0f,
  9626. /*.yarn_beta_slow =*/ 1.0f,
  9627. /*.yarn_orig_ctx =*/ 0,
  9628. /*.defrag_thold =*/ -1.0f,
  9629. /*.cb_eval =*/ nullptr,
  9630. /*.cb_eval_user_data =*/ nullptr,
  9631. /*.type_k =*/ GGML_TYPE_F16,
  9632. /*.type_v =*/ GGML_TYPE_F16,
  9633. /*.logits_all =*/ false,
  9634. /*.embeddings =*/ false,
  9635. /*.offload_kqv =*/ true,
  9636. /*.flash_attn =*/ false,
  9637. /*.no_perf =*/ true,
  9638. /*.cross_attn =*/ false,
  9639. /*.abort_callback =*/ nullptr,
  9640. /*.abort_callback_data =*/ nullptr,
  9641. };
  9642. return result;
  9643. }
  9644. struct llama_sampler_chain_params llama_sampler_chain_default_params() {
  9645. struct llama_sampler_chain_params result = {
  9646. /*.no_perf =*/ true,
  9647. };
  9648. return result;
  9649. }
  // Upper bound on the number of devices reported by the library (fixed at 16).
  size_t llama_max_devices(void) {
      const size_t max_devices = 16;

      return max_devices;
  }
  9653. bool llama_supports_mmap(void) {
  9654. return llama_mmap::SUPPORTED;
  9655. }
  9656. bool llama_supports_mlock(void) {
  9657. return llama_mlock::SUPPORTED;
  9658. }
  9659. bool llama_supports_gpu_offload(void) {
  9660. return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
  9661. llama_supports_rpc();
  9662. }
  9663. bool llama_supports_rpc(void) {
  9664. return ggml_backend_reg_by_name("RPC") != nullptr;
  9665. }
  9666. void llama_backend_init(void) {
  9667. ggml_time_init();
  9668. // needed to initialize f16 tables
  9669. {
  9670. struct ggml_init_params params = { 0, NULL, false };
  9671. struct ggml_context * ctx = ggml_init(params);
  9672. ggml_free(ctx);
  9673. }
  9674. }
  9675. void llama_numa_init(enum ggml_numa_strategy numa) {
  9676. if (numa != GGML_NUMA_STRATEGY_DISABLED) {
  9677. auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  9678. GGML_ASSERT(dev && "CPU backend is not loaded");
  9679. auto * reg = ggml_backend_dev_backend_reg(dev);
  9680. auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
  9681. numa_init_fn(numa);
  9682. }
  9683. }
  9684. void llama_backend_free(void) {
  9685. ggml_quantize_free();
  9686. }
  9687. int64_t llama_time_us(void) {
  9688. return ggml_time_us();
  9689. }
  9690. struct llama_model * llama_load_model_from_file(
  9691. const char * path_model,
  9692. struct llama_model_params params) {
  9693. ggml_time_init();
  9694. llama_model * model = new llama_model;
  9695. unsigned cur_percentage = 0;
  9696. if (params.progress_callback == NULL) {
  9697. params.progress_callback_user_data = &cur_percentage;
  9698. params.progress_callback = [](float progress, void * ctx) {
  9699. unsigned * cur_percentage_p = (unsigned *) ctx;
  9700. unsigned percentage = (unsigned) (100 * progress);
  9701. while (percentage > *cur_percentage_p) {
  9702. *cur_percentage_p = percentage;
  9703. LLAMA_LOG_CONT(".");
  9704. if (percentage >= 100) {
  9705. LLAMA_LOG_CONT("\n");
  9706. }
  9707. }
  9708. return true;
  9709. };
  9710. }
  9711. if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
  9712. // split the servers set them into model->rpc_servers
  9713. std::string servers(params.rpc_servers);
  9714. size_t pos = 0;
  9715. while ((pos = servers.find(',')) != std::string::npos) {
  9716. std::string server = servers.substr(0, pos);
  9717. model->rpc_servers.push_back(server);
  9718. servers.erase(0, pos + 1);
  9719. }
  9720. model->rpc_servers.push_back(servers);
  9721. }
  9722. // add RPC devices
  9723. if (!model->rpc_servers.empty()) {
  9724. ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
  9725. if (!rpc_reg) {
  9726. LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
  9727. llama_free_model(model);
  9728. return nullptr;
  9729. }
  9730. typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
  9731. ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
  9732. if (!ggml_backend_rpc_add_device_fn) {
  9733. LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
  9734. llama_free_model(model);
  9735. return nullptr;
  9736. }
  9737. for (const std::string & server : model->rpc_servers) {
  9738. ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
  9739. if (dev) {
  9740. model->devices.push_back(dev);
  9741. } else {
  9742. LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
  9743. llama_free_model(model);
  9744. return nullptr;
  9745. }
  9746. }
  9747. }
  9748. // create list of devices to use with this model
  9749. if (params.devices) {
  9750. for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
  9751. model->devices.push_back(*dev);
  9752. }
  9753. } else {
  9754. // use all available devices
  9755. for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  9756. ggml_backend_dev_t dev = ggml_backend_dev_get(i);
  9757. switch (ggml_backend_dev_type(dev)) {
  9758. case GGML_BACKEND_DEVICE_TYPE_CPU:
  9759. case GGML_BACKEND_DEVICE_TYPE_ACCEL:
  9760. // skip CPU backends since they are handled separately
  9761. break;
  9762. case GGML_BACKEND_DEVICE_TYPE_GPU:
  9763. model->devices.push_back(dev);
  9764. break;
  9765. }
  9766. }
  9767. }
  9768. // if using single GPU mode, remove all except the main GPU
  9769. if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
  9770. if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
  9771. LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
  9772. llama_free_model(model);
  9773. return nullptr;
  9774. }
  9775. ggml_backend_dev_t main_gpu = model->devices[params.main_gpu];
  9776. model->devices.clear();
  9777. model->devices.push_back(main_gpu);
  9778. }
  9779. for (auto * dev : model->devices) {
  9780. size_t free, total; // NOLINT
  9781. ggml_backend_dev_memory(dev, &free, &total);
  9782. LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024);
  9783. }
  9784. int status = llama_model_load(path_model, *model, params);
  9785. GGML_ASSERT(status <= 0);
  9786. if (status < 0) {
  9787. if (status == -1) {
  9788. LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
  9789. } else if (status == -2) {
  9790. LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
  9791. }
  9792. llama_free_model(model);
  9793. return nullptr;
  9794. }
  9795. return model;
  9796. }
  9797. struct llama_context * llama_new_context_with_model(
  9798. struct llama_model * model,
  9799. struct llama_context_params params) {
  9800. if (!model) {
  9801. LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
  9802. return nullptr;
  9803. }
  9804. if (params.n_batch == 0 && params.n_ubatch == 0) {
  9805. LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
  9806. return nullptr;
  9807. }
  9808. if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
  9809. LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
  9810. return nullptr;
  9811. }
  9812. if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
  9813. LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
  9814. params.flash_attn = false;
  9815. }
  9816. if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
  9817. LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
  9818. params.flash_attn = false;
  9819. }
  9820. if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
  9821. LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
  9822. return nullptr;
  9823. }
  9824. llama_context * ctx = new llama_context(*model);
  9825. const auto & hparams = model->hparams;
  9826. auto & cparams = ctx->cparams;
  9827. cparams.n_seq_max = std::max(1u, params.n_seq_max);
  9828. cparams.n_threads = params.n_threads;
  9829. cparams.n_threads_batch = params.n_threads_batch;
  9830. cparams.yarn_ext_factor = params.yarn_ext_factor;
  9831. cparams.yarn_attn_factor = params.yarn_attn_factor;
  9832. cparams.yarn_beta_fast = params.yarn_beta_fast;
  9833. cparams.yarn_beta_slow = params.yarn_beta_slow;
  9834. cparams.defrag_thold = params.defrag_thold;
  9835. cparams.embeddings = params.embeddings;
  9836. cparams.offload_kqv = params.offload_kqv;
  9837. cparams.flash_attn = params.flash_attn;
  9838. cparams.no_perf = params.no_perf;
  9839. cparams.pooling_type = params.pooling_type;
  9840. cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
  9841. cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
  9842. cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
  9843. // this is necessary due to kv_self.n being padded later during inference
  9844. cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
  9845. // with causal attention, the batch size is limited by the context size
  9846. cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
  9847. // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
  9848. // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
  9849. // ref: https://github.com/ggerganov/llama.cpp/pull/5021
  9850. if (cparams.n_batch < GGML_KQ_MASK_PAD) {
  9851. LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
  9852. cparams.n_batch = GGML_KQ_MASK_PAD;
  9853. }
  9854. cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
  9855. cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
  9856. hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
  9857. hparams.n_ctx_train;
  9858. cparams.cb_eval = params.cb_eval;
  9859. cparams.cb_eval_user_data = params.cb_eval_user_data;
  9860. auto rope_scaling_type = params.rope_scaling_type;
  9861. if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
  9862. rope_scaling_type = hparams.rope_scaling_type_train;
  9863. }
  9864. if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
  9865. cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
  9866. }
  9867. if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
  9868. cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
  9869. }
  9870. cparams.yarn_attn_factor *= hparams.rope_attn_factor;
  9871. if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
  9872. if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
  9873. cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
  9874. } else {
  9875. cparams.pooling_type = hparams.pooling_type;
  9876. }
  9877. }
  9878. if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
  9879. cparams.causal_attn = hparams.causal_attn;
  9880. } else {
  9881. cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
  9882. }
  9883. const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
  9884. LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
  9885. LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
  9886. LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq);
  9887. LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
  9888. LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
  9889. LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
  9890. LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
  9891. LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
  9892. if (n_ctx_per_seq < hparams.n_ctx_train) {
  9893. LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
  9894. __func__, n_ctx_per_seq, hparams.n_ctx_train);
  9895. }
  9896. if (n_ctx_per_seq > hparams.n_ctx_train) {
  9897. LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
  9898. __func__, n_ctx_per_seq, hparams.n_ctx_train);
  9899. }
  9900. ctx->logits_all = params.logits_all;
  9901. // build worst-case graph for encoder if a model contains encoder
  9902. ctx->is_encoding = llama_model_has_encoder(model);
  9903. uint32_t kv_size = cparams.n_ctx;
  9904. ggml_type type_k = params.type_k;
  9905. ggml_type type_v = params.type_v;
  9906. // Mamba only needs a constant number of KV cache cells per sequence
  9907. if (llama_model_is_recurrent(model)) {
  9908. // Mamba needs at least as many KV cells as there are sequences kept at any time
  9909. kv_size = std::max((uint32_t) 1, params.n_seq_max);
  9910. // it's probably best to keep as much precision as possible for the states
  9911. type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
  9912. type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
  9913. }
  9914. GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
  9915. GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
  9916. if (!hparams.vocab_only) {
  9917. // GPU backends
  9918. for (auto * dev : model->devices) {
  9919. ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
  9920. if (backend == nullptr) {
  9921. LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
  9922. llama_free(ctx);
  9923. return nullptr;
  9924. }
  9925. ctx->backends.emplace_back(backend);
  9926. }
  9927. // add ACCEL backends (such as BLAS)
  9928. for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  9929. ggml_backend_dev_t dev = ggml_backend_dev_get(i);
  9930. if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
  9931. ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
  9932. if (backend == nullptr) {
  9933. LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
  9934. llama_free(ctx);
  9935. return nullptr;
  9936. }
  9937. ctx->backends.emplace_back(backend);
  9938. }
  9939. }
  9940. // add CPU backend
  9941. ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
  9942. if (ctx->backend_cpu == nullptr) {
  9943. LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
  9944. llama_free(ctx);
  9945. return nullptr;
  9946. }
  9947. ctx->backends.emplace_back(ctx->backend_cpu);
  9948. // create a list of the set_n_threads functions in the backends
  9949. for (auto & backend : ctx->backends) {
  9950. ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
  9951. ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
  9952. if (reg) {
  9953. auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
  9954. if (ggml_backend_set_n_threads_fn) {
  9955. ctx->set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
  9956. }
  9957. }
  9958. }
  9959. llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
  9960. if (!llama_kv_cache_init(ctx->kv_self, ctx->model, ctx->cparams, type_k, type_v, kv_size, cparams.offload_kqv)) {
  9961. LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
  9962. llama_free(ctx);
  9963. return nullptr;
  9964. }
  9965. {
  9966. size_t memory_size_k = 0;
  9967. size_t memory_size_v = 0;
  9968. for (auto & k : ctx->kv_self.k_l) {
  9969. memory_size_k += ggml_nbytes(k);
  9970. }
  9971. for (auto & v : ctx->kv_self.v_l) {
  9972. memory_size_v += ggml_nbytes(v);
  9973. }
  9974. LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
  9975. (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
  9976. ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
  9977. ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
  9978. }
  9979. // graph outputs buffer
  9980. {
  9981. // resized during inference when a batch uses more outputs
  9982. if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
  9983. LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
  9984. llama_free(ctx);
  9985. return nullptr;
  9986. }
  9987. LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
  9988. ggml_backend_buffer_name(ctx->buf_output.get()),
  9989. ggml_backend_buffer_get_size(ctx->buf_output.get()) / 1024.0 / 1024.0);
  9990. }
  9991. // scheduler and compute buffers
  9992. {
  9993. // buffer types used for the compute buffer of each backend
  9994. std::vector<ggml_backend_buffer_type_t> backend_buft;
  9995. std::vector<ggml_backend_t> backend_ptrs;
  9996. for (auto & backend : ctx->backends) {
  9997. auto * buft = ggml_backend_get_default_buffer_type(backend.get());
  9998. auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
  9999. if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
  10000. // use the host buffer of the first device CPU for faster transfer of the intermediate state
  10001. auto * dev = model->devices[0];
  10002. auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
  10003. if (host_buft) {
  10004. buft = host_buft;
  10005. }
  10006. }
  10007. backend_buft.push_back(buft);
  10008. backend_ptrs.push_back(backend.get());
  10009. }
  10010. const size_t max_nodes = llama_model_max_nodes(*model);
  10011. // buffer used to store the computation graph and the tensor meta data
  10012. ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
  10013. // TODO: move these checks to ggml_backend_sched
  10014. // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
  10015. bool pipeline_parallel =
  10016. llama_get_device_count(*model) > 1 &&
  10017. model->n_gpu_layers > (int)model->hparams.n_layer &&
  10018. model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
  10019. params.offload_kqv;
  10020. // pipeline parallelism requires support for async compute and events in all devices
  10021. if (pipeline_parallel) {
  10022. for (auto & backend : ctx->backends) {
  10023. auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
  10024. if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
  10025. // ignore CPU backend
  10026. continue;
  10027. }
  10028. auto * dev = ggml_backend_get_device(backend.get());
  10029. ggml_backend_dev_props props;
  10030. ggml_backend_dev_get_props(dev, &props);
  10031. if (!props.caps.async || !props.caps.events) {
  10032. // device does not support async compute or events
  10033. pipeline_parallel = false;
  10034. break;
  10035. }
  10036. }
  10037. }
  10038. ctx->sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
  10039. if (pipeline_parallel) {
  10040. LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched.get()));
  10041. }
  10042. // initialize scheduler with the worst-case graph
  10043. uint32_t n_seqs = 1; // TODO: worst-case number of sequences
  10044. uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
  10045. llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
  10046. llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  10047. ggml_cgraph * gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
  10048. // reserve pp graph first so that buffers are only allocated once
  10049. ggml_backend_sched_reserve(ctx->sched.get(), gf_pp);
  10050. int n_splits_pp = ggml_backend_sched_get_n_splits(ctx->sched.get());
  10051. int n_nodes_pp = ggml_graph_n_nodes(gf_pp);
  10052. // reserve with tg graph to get the number of splits and nodes
  10053. llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  10054. ggml_cgraph * gf_tg = llama_build_graph(*ctx, ubatch_tg, true);
  10055. ggml_backend_sched_reserve(ctx->sched.get(), gf_tg);
  10056. int n_splits_tg = ggml_backend_sched_get_n_splits(ctx->sched.get());
  10057. int n_nodes_tg = ggml_graph_n_nodes(gf_tg);
  10058. // reserve again with pp graph to avoid ggml-alloc reallocations during inference
  10059. gf_pp = llama_build_graph(*ctx, ubatch_pp, true);
  10060. if (!ggml_backend_sched_reserve(ctx->sched.get(), gf_pp)) {
  10061. LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
  10062. llama_free(ctx);
  10063. return nullptr;
  10064. }
  10065. for (size_t i = 0; i < backend_ptrs.size(); ++i) {
  10066. ggml_backend_t backend = backend_ptrs[i];
  10067. ggml_backend_buffer_type_t buft = backend_buft[i];
  10068. size_t size = ggml_backend_sched_get_buffer_size(ctx->sched.get(), backend);
  10069. if (size > 1) {
  10070. LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
  10071. ggml_backend_buft_name(buft),
  10072. size / 1024.0 / 1024.0);
  10073. }
  10074. }
  10075. if (n_nodes_pp == n_nodes_tg) {
  10076. LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
  10077. } else {
  10078. LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
  10079. }
  10080. if (n_splits_pp == n_splits_tg) {
  10081. LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
  10082. } else {
  10083. LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
  10084. }
  10085. }
  10086. }
  10087. return ctx;
  10088. }
  10089. //
  10090. // kv cache
  10091. //
  10092. // TODO: tmp bridges below until `struct llama_kv_cache` is exposed through the public API
// Context-level bridge: builds a KV cache view by forwarding to the
// llama_kv_cache overload with the context's kv_self cache.
// n_seq_max is passed through unchanged.
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
    return llama_kv_cache_view_init(ctx->kv_self, n_seq_max);
}
// Context-level bridge: refreshes `view` from the context's kv_self cache.
// Note that the underlying overload takes the view first and the cache second.
void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
    llama_kv_cache_view_update(view, ctx->kv_self);
}
// Context-level bridge: returns whatever the llama_kv_cache overload reports
// as the token count of the context's kv_self cache.
int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
    return llama_get_kv_cache_token_count(ctx->kv_self);
}
// Context-level bridge: returns whatever the llama_kv_cache overload reports
// as the number of used cells of the context's kv_self cache.
int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
    return llama_get_kv_cache_used_cells(ctx->kv_self);
}
// Context-level bridge: forwards the clear request to the context's kv_self cache.
void llama_kv_cache_clear(struct llama_context * ctx) {
    llama_kv_cache_clear(ctx->kv_self);
}
// Context-level bridge: forwards a sequence-removal request for positions
// p0..p1 of `seq_id` to the context's kv_self cache and returns the result of
// the underlying overload unchanged.
bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
}
  10111. void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
  10112. if (seq_id_src == seq_id_dst) {
  10113. return;
  10114. }
  10115. llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
  10116. }
// Context-level bridge: forwards the seq_keep request for `seq_id` to the
// context's kv_self cache.
void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
    llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
}
  10120. void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
  10121. if (delta == 0) {
  10122. return;
  10123. }
  10124. llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
  10125. }
  10126. void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
  10127. if (d == 1) {
  10128. return;
  10129. }
  10130. llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
  10131. }
// Context-level bridge: returns the result of the llama_kv_cache overload for
// `seq_id` on the context's kv_self cache.
llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
    return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
}
// Context-level bridge: forwards the defrag request to the context's kv_self cache.
void llama_kv_cache_defrag(struct llama_context * ctx) {
    llama_kv_cache_defrag(ctx->kv_self);
}
// Public entry point for KV cache updates. Unlike the other bridges it hands
// the whole context (not just kv_self) to llama_kv_cache_update_internal.
void llama_kv_cache_update(struct llama_context * ctx) {
    llama_kv_cache_update_internal(*ctx);
}
// Context-level bridge: returns the result of the llama_kv_cache overload for
// the context's kv_self cache.
bool llama_kv_cache_can_shift(struct llama_context * ctx) {
    return llama_kv_cache_can_shift(ctx->kv_self);
}
  10144. ///
  10145. int32_t llama_encode(
  10146. struct llama_context * ctx,
  10147. struct llama_batch batch) {
  10148. const int ret = llama_encode_internal(*ctx, batch);
  10149. if (ret != 0) {
  10150. LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
  10151. }
  10152. return ret;
  10153. }
  10154. int32_t llama_decode(
  10155. struct llama_context * ctx,
  10156. struct llama_batch batch) {
  10157. const int ret = llama_decode_internal(*ctx, batch);
  10158. if (ret != 0) {
  10159. LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  10160. }
  10161. return ret;
  10162. }
  10163. //
  10164. // vocab
  10165. //
  10166. // TODO: tmp bridges below until `struct llama_vocab` is exposed through the public API
// Model-level bridge: forwards to llama_token_get_text_impl with the model's vocab.
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
    return llama_token_get_text_impl(model->vocab, token);
}
// Model-level bridge: forwards to llama_token_get_score_impl with the model's vocab.
float llama_token_get_score(const struct llama_model * model, llama_token token) {
    return llama_token_get_score_impl(model->vocab, token);
}
// Model-level bridge: forwards to llama_token_get_attr_impl with the model's vocab.
enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
    return llama_token_get_attr_impl(model->vocab, token);
}
// Model-level bridge: forwards to llama_token_is_eog_impl with the model's vocab.
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
    return llama_token_is_eog_impl(model->vocab, token);
}
// Model-level bridge: forwards to llama_token_is_control_impl with the model's vocab.
bool llama_token_is_control(const struct llama_model * model, llama_token token) {
    return llama_token_is_control_impl(model->vocab, token);
}
// Model-level bridge: forwards to llama_token_bos_impl with the model's vocab.
llama_token llama_token_bos(const struct llama_model * model) {
    return llama_token_bos_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_eos_impl with the model's vocab.
llama_token llama_token_eos(const struct llama_model * model) {
    return llama_token_eos_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_eot_impl with the model's vocab.
llama_token llama_token_eot(const struct llama_model * model) {
    return llama_token_eot_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_cls_impl with the model's vocab.
llama_token llama_token_cls(const struct llama_model * model) {
    return llama_token_cls_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_sep_impl with the model's vocab.
llama_token llama_token_sep(const struct llama_model * model) {
    return llama_token_sep_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_nl_impl with the model's vocab.
llama_token llama_token_nl (const struct llama_model * model) {
    return llama_token_nl_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_pad_impl with the model's vocab.
llama_token llama_token_pad(const struct llama_model * model) {
    return llama_token_pad_impl(model->vocab);
}
// Model-level bridge: forwards to llama_add_bos_token_impl with the model's vocab.
bool llama_add_bos_token(const struct llama_model * model) {
    return llama_add_bos_token_impl(model->vocab);
}
// Model-level bridge: forwards to llama_add_eos_token_impl with the model's vocab.
bool llama_add_eos_token(const struct llama_model * model) {
    return llama_add_eos_token_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_prefix_impl with the model's vocab.
llama_token llama_token_prefix(const struct llama_model * model) {
    return llama_token_prefix_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_middle_impl with the model's vocab.
llama_token llama_token_middle(const struct llama_model * model) {
    return llama_token_middle_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_suffix_impl with the model's vocab.
llama_token llama_token_suffix(const struct llama_model * model) {
    return llama_token_suffix_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_fim_pre_impl with the model's vocab.
llama_token llama_token_fim_pre(const struct llama_model * model) {
    return llama_token_fim_pre_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_fim_suf_impl with the model's vocab.
llama_token llama_token_fim_suf(const struct llama_model * model) {
    return llama_token_fim_suf_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_fim_mid_impl with the model's vocab.
llama_token llama_token_fim_mid(const struct llama_model * model) {
    return llama_token_fim_mid_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_fim_pad_impl with the model's vocab.
llama_token llama_token_fim_pad(const struct llama_model * model) {
    return llama_token_fim_pad_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_fim_rep_impl with the model's vocab.
llama_token llama_token_fim_rep(const struct llama_model * model) {
    return llama_token_fim_rep_impl(model->vocab);
}
// Model-level bridge: forwards to llama_token_fim_sep_impl with the model's vocab.
llama_token llama_token_fim_sep(const struct llama_model * model) {
    return llama_token_fim_sep_impl(model->vocab);
}
  10236. //
  10237. // tokenization
  10238. //
// Model-level bridge for tokenization: every argument is passed through
// unchanged to llama_tokenize_impl together with the model's vocab, and the
// implementation's return value is returned as-is.
int32_t llama_tokenize(
        const struct llama_model * model,
        const char * text,
        int32_t text_len,
        llama_token * tokens,
        int32_t n_tokens_max,
        bool add_special,
        bool parse_special) {
    return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
}
// Model-level bridge for rendering a single token into `buf`: every argument
// is passed through unchanged to llama_token_to_piece_impl together with the
// model's vocab, and the implementation's return value is returned as-is.
int32_t llama_token_to_piece(
        const struct llama_model * model,
        llama_token token,
        char * buf,
        int32_t length,
        int32_t lstrip,
        bool special) {
    return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
}
// Model-level bridge for detokenization: every argument is passed through
// unchanged to llama_detokenize_impl together with the model's vocab, and the
// implementation's return value is returned as-is.
int32_t llama_detokenize(
        const struct llama_model * model,
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
        bool unparse_special) {
    return llama_detokenize_impl(model->vocab, tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
  10268. //
  10269. // chat templates
  10270. //
  10271. int32_t llama_chat_apply_template(
  10272. const struct llama_model * model,
  10273. const char * tmpl,
  10274. const struct llama_chat_message * chat,
  10275. size_t n_msg,
  10276. bool add_ass,
  10277. char * buf,
  10278. int32_t length) {
  10279. std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
  10280. if (tmpl == nullptr) {
  10281. GGML_ASSERT(model != nullptr);
  10282. // load template from model, if available
  10283. const auto & it = model->gguf_kv.find("tokenizer.chat_template");
  10284. if (it != model->gguf_kv.end() && it->second.size() > 0) {
  10285. curr_tmpl = it->second;
  10286. }
  10287. else {
  10288. // worst case: there is no information about template, we will use chatml by default
  10289. curr_tmpl = "chatml"; // see llm_chat_apply_template
  10290. }
  10291. }
  10292. // format the chat to string
  10293. std::vector<const llama_chat_message *> chat_vec;
  10294. chat_vec.resize(n_msg);
  10295. for (size_t i = 0; i < n_msg; i++) {
  10296. chat_vec[i] = &chat[i];
  10297. }
  10298. std::string formatted_chat;
  10299. llm_chat_template detected_tmpl = llm_chat_detect_template(curr_tmpl);
  10300. if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
  10301. return -1;
  10302. }
  10303. int32_t res = llm_chat_apply_template(detected_tmpl, chat_vec, formatted_chat, add_ass);
  10304. if (res < 0) {
  10305. return res;
  10306. }
  10307. if (buf && length > 0) {
  10308. strncpy(buf, formatted_chat.c_str(), length);
  10309. }
  10310. return res;
  10311. }
  10312. //
  10313. // sampling
  10314. //
// TODO: remove indirection when vocab becomes accessible in llama-sampling.cpp
// Model-level bridge: creates a grammar sampler by forwarding the grammar
// string and root to llama_sampler_init_grammar_impl with the model's vocab.
struct llama_sampler * llama_sampler_init_grammar(const struct llama_model * model, const char * grammar_str, const char * grammar_root) {
    return llama_sampler_init_grammar_impl(model->vocab, grammar_str, grammar_root);
}
// Model-level bridge: creates an infill sampler by forwarding to
// llama_sampler_init_infill_impl with the model's vocab.
struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model) {
    return llama_sampler_init_infill_impl(model->vocab);
}
// Model-level bridge: creates a DRY sampler by forwarding to
// llama_sampler_init_dry_impl. Besides the caller's parameters it supplies the
// model's vocab and the value of llama_n_ctx_train(model).
struct llama_sampler * llama_sampler_init_dry(const struct llama_model * model, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
    return llama_sampler_init_dry_impl(model->vocab, llama_n_ctx_train(model), dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers);
}
  10325. //
  10326. // model split
  10327. //
  10328. int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
  10329. static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
  10330. if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
  10331. return strlen(split_path);
  10332. }
  10333. return 0;
  10334. }
  10335. int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
  10336. std::string str_split_path(split_path);
  10337. char postfix[32];
  10338. snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
  10339. std::string str_postfix(postfix);
  10340. // check if dest ends with postfix
  10341. int size_prefix = str_split_path.size() - str_postfix.size();
  10342. if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
  10343. snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
  10344. return size_prefix;
  10345. }
  10346. return 0;
  10347. }
  10348. const char * llama_print_system_info(void) {
  10349. static std::string s;
  10350. for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
  10351. auto * reg = ggml_backend_reg_get(i);
  10352. auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
  10353. if (get_features_fn) {
  10354. ggml_backend_feature * features = get_features_fn(reg);
  10355. s += ggml_backend_reg_name(reg);
  10356. s += " : ";
  10357. for (; features->name; features++) {
  10358. s += features->name;
  10359. s += " = ";
  10360. s += features->value;
  10361. s += " | ";
  10362. }
  10363. }
  10364. }
  10365. return s.c_str();
  10366. }
  10367. //
  10368. // perf
  10369. //
  10370. struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) {
  10371. struct llama_perf_context_data data = {};
  10372. if (ctx == nullptr) {
  10373. return data;
  10374. }
  10375. data.t_start_ms = 1e-3 * ctx->t_start_us;
  10376. data.t_load_ms = 1e-3 * ctx->t_load_us;
  10377. data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us;
  10378. data.t_eval_ms = 1e-3 * ctx->t_eval_us;
  10379. data.n_p_eval = std::max(1, ctx->n_p_eval);
  10380. data.n_eval = std::max(1, ctx->n_eval);
  10381. return data;
  10382. }
  10383. void llama_perf_context_print(const struct llama_context * ctx) {
  10384. const auto data = llama_perf_context(ctx);
  10385. const double t_end_ms = 1e-3 * ggml_time_us();
  10386. LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
  10387. LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
  10388. __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
  10389. LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
  10390. __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
  10391. LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
  10392. }
  10393. void llama_perf_context_reset(struct llama_context * ctx) {
  10394. ctx->t_start_us = ggml_time_us();
  10395. ctx->t_eval_us = ctx->n_eval = 0;
  10396. ctx->t_p_eval_us = ctx->n_p_eval = 0;
  10397. }