/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include "ggml.h" // ggml_op

#include <string>

//
// gguf constants (sync with gguf.py)
//
enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_MLLAMA,
    LLM_ARCH_DECI,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
    LLM_ARCH_GROK,
    LLM_ARCH_GPT2,
    LLM_ARCH_GPTJ,
    LLM_ARCH_GPTNEOX,
    LLM_ARCH_MPT,
    LLM_ARCH_STARCODER,
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
    LLM_ARCH_NOMIC_BERT,
    LLM_ARCH_JINA_BERT_V2,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
    LLM_ARCH_QWEN2,
    LLM_ARCH_QWEN2MOE,
    LLM_ARCH_QWEN2VL,
    LLM_ARCH_PHI2,
    LLM_ARCH_PHI3,
    LLM_ARCH_PLAMO,
    LLM_ARCH_CODESHELL,
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
    LLM_ARCH_MINICPM,
    LLM_ARCH_MINICPM3,
    LLM_ARCH_GEMMA,
    LLM_ARCH_GEMMA2,
    LLM_ARCH_STARCODER2,
    LLM_ARCH_MAMBA,
    LLM_ARCH_XVERSE,
    LLM_ARCH_COMMAND_R,
    LLM_ARCH_COHERE2,
    LLM_ARCH_DBRX,
    LLM_ARCH_OLMO,
    LLM_ARCH_OLMO2,
    LLM_ARCH_OLMOE,
    LLM_ARCH_OPENELM,
    LLM_ARCH_ARCTIC,
    LLM_ARCH_DEEPSEEK,
    LLM_ARCH_DEEPSEEK2,
    LLM_ARCH_CHATGLM,
    LLM_ARCH_BITNET,
    LLM_ARCH_T5,
    LLM_ARCH_T5ENCODER,
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_EXAONE,
    LLM_ARCH_RWKV6,
    LLM_ARCH_GRANITE,
    LLM_ARCH_GRANITE_MOE,
    LLM_ARCH_CHAMELEON,
    LLM_ARCH_SOLAR,
    LLM_ARCH_WAVTOKENIZER_DEC,
    LLM_ARCH_UNKNOWN,
};
enum llm_kv {
    LLM_KV_GENERAL_TYPE,
    LLM_KV_GENERAL_ARCHITECTURE,
    LLM_KV_GENERAL_QUANTIZATION_VERSION,
    LLM_KV_GENERAL_ALIGNMENT,
    LLM_KV_GENERAL_NAME,
    LLM_KV_GENERAL_AUTHOR,
    LLM_KV_GENERAL_VERSION,
    LLM_KV_GENERAL_URL,
    LLM_KV_GENERAL_DESCRIPTION,
    LLM_KV_GENERAL_LICENSE,
    LLM_KV_GENERAL_SOURCE_URL,
    LLM_KV_GENERAL_SOURCE_HF_REPO,

    LLM_KV_VOCAB_SIZE,
    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
    LLM_KV_FEATURES_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_LEADING_DENSE_BLOCK_COUNT,
    LLM_KV_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,
    LLM_KV_EXPERT_COUNT,
    LLM_KV_EXPERT_USED_COUNT,
    LLM_KV_EXPERT_SHARED_COUNT,
    LLM_KV_EXPERT_WEIGHTS_SCALE,
    LLM_KV_EXPERT_WEIGHTS_NORM,
    LLM_KV_EXPERT_GATING_FUNC,
    LLM_KV_POOLING_TYPE,
    LLM_KV_LOGIT_SCALE,
    LLM_KV_DECODER_START_TOKEN_ID,
    LLM_KV_ATTN_LOGIT_SOFTCAPPING,
    LLM_KV_FINAL_LOGIT_SOFTCAPPING,
    LLM_KV_SWIN_NORM,
    LLM_KV_RESCALE_EVERY_N_LAYERS,
    LLM_KV_TIME_MIX_EXTRA_DIM,
    LLM_KV_TIME_DECAY_EXTRA_DIM,
    LLM_KV_RESIDUAL_SCALE,
    LLM_KV_EMBEDDING_SCALE,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
    LLM_KV_ATTENTION_CLAMP_KQV,
    LLM_KV_ATTENTION_KEY_LENGTH,
    LLM_KV_ATTENTION_VALUE_LENGTH,
    LLM_KV_ATTENTION_LAYERNORM_EPS,
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
    LLM_KV_ATTENTION_GROUPNORM_EPS,
    LLM_KV_ATTENTION_GROUPNORM_GROUPS,
    LLM_KV_ATTENTION_CAUSAL,
    LLM_KV_ATTENTION_Q_LORA_RANK,
    LLM_KV_ATTENTION_KV_LORA_RANK,
    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
    LLM_KV_ATTENTION_SLIDING_WINDOW,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
    LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,

    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_ROPE_SCALING_TYPE,
    LLM_KV_ROPE_SCALING_FACTOR,
    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
    LLM_KV_ROPE_SCALING_FINETUNED,
    LLM_KV_ROPE_SCALING_YARN_LOG_MUL,

    LLM_KV_SPLIT_NO,
    LLM_KV_SPLIT_COUNT,
    LLM_KV_SPLIT_TENSORS_COUNT,

    LLM_KV_SSM_INNER_SIZE,
    LLM_KV_SSM_CONV_KERNEL,
    LLM_KV_SSM_STATE_SIZE,
    LLM_KV_SSM_TIME_STEP_RANK,
    LLM_KV_SSM_DT_B_C_RMS,

    LLM_KV_WKV_HEAD_SIZE,

    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_PRE,
    LLM_KV_TOKENIZER_LIST,
    LLM_KV_TOKENIZER_TOKEN_TYPE,
    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
    LLM_KV_TOKENIZER_SCORES,
    LLM_KV_TOKENIZER_MERGES,
    LLM_KV_TOKENIZER_BOS_ID,
    LLM_KV_TOKENIZER_EOS_ID,
    LLM_KV_TOKENIZER_EOT_ID,
    LLM_KV_TOKENIZER_EOM_ID,
    LLM_KV_TOKENIZER_UNK_ID,
    LLM_KV_TOKENIZER_SEP_ID,
    LLM_KV_TOKENIZER_PAD_ID,
    LLM_KV_TOKENIZER_CLS_ID,
    LLM_KV_TOKENIZER_MASK_ID,
    LLM_KV_TOKENIZER_ADD_BOS,
    LLM_KV_TOKENIZER_ADD_EOS,
    LLM_KV_TOKENIZER_ADD_PREFIX,
    LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
    LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
    LLM_KV_TOKENIZER_HF_JSON,
    LLM_KV_TOKENIZER_RWKV,
    LLM_KV_TOKENIZER_FIM_PRE_ID,
    LLM_KV_TOKENIZER_FIM_SUF_ID,
    LLM_KV_TOKENIZER_FIM_MID_ID,
    LLM_KV_TOKENIZER_FIM_PAD_ID,
    LLM_KV_TOKENIZER_FIM_REP_ID,
    LLM_KV_TOKENIZER_FIM_SEP_ID,

    LLM_KV_ADAPTER_TYPE,
    LLM_KV_ADAPTER_LORA_ALPHA,

    LLM_KV_POSNET_EMBEDDING_LENGTH,
    LLM_KV_POSNET_BLOCK_COUNT,

    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
    LLM_KV_CONVNEXT_BLOCK_COUNT,

    // deprecated:
    LLM_KV_TOKENIZER_PREFIX_ID,
    LLM_KV_TOKENIZER_SUFFIX_ID,
    LLM_KV_TOKENIZER_MIDDLE_ID,
};
enum llm_tensor {
    LLM_TENSOR_TOKEN_EMBD,
    LLM_TENSOR_TOKEN_EMBD_NORM,
    LLM_TENSOR_TOKEN_TYPES,
    LLM_TENSOR_POS_EMBD,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_ROPE_FREQS,
    LLM_TENSOR_ROPE_FACTORS_LONG,
    LLM_TENSOR_ROPE_FACTORS_SHORT,
    LLM_TENSOR_ATTN_Q,
    LLM_TENSOR_ATTN_K,
    LLM_TENSOR_ATTN_V,
    LLM_TENSOR_ATTN_QKV,
    LLM_TENSOR_ATTN_OUT,
    LLM_TENSOR_ATTN_NORM,
    LLM_TENSOR_ATTN_NORM_2,
    LLM_TENSOR_ATTN_OUT_NORM,
    LLM_TENSOR_ATTN_POST_NORM,
    LLM_TENSOR_ATTN_ROT_EMBD,
    LLM_TENSOR_FFN_GATE_INP,
    LLM_TENSOR_FFN_GATE_INP_SHEXP,
    LLM_TENSOR_FFN_NORM,
    LLM_TENSOR_FFN_POST_NORM,
    LLM_TENSOR_FFN_GATE,
    LLM_TENSOR_FFN_DOWN,
    LLM_TENSOR_FFN_UP,
    LLM_TENSOR_FFN_ACT,
    LLM_TENSOR_FFN_DOWN_EXP,  // split experts for backward compatibility
    LLM_TENSOR_FFN_GATE_EXP,
    LLM_TENSOR_FFN_UP_EXP,
    LLM_TENSOR_FFN_NORM_EXPS,
    LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
    LLM_TENSOR_FFN_GATE_EXPS,
    LLM_TENSOR_FFN_UP_EXPS,
    LLM_TENSOR_FFN_DOWN_SHEXP,
    LLM_TENSOR_FFN_GATE_SHEXP,
    LLM_TENSOR_FFN_UP_SHEXP,
    LLM_TENSOR_FFN_EXP_PROBS_B,
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
    LLM_TENSOR_LAYER_OUT_NORM,
    LLM_TENSOR_SSM_IN,
    LLM_TENSOR_SSM_CONV1D,
    LLM_TENSOR_SSM_X,
    LLM_TENSOR_SSM_DT,
    LLM_TENSOR_SSM_A,
    LLM_TENSOR_SSM_D,
    LLM_TENSOR_SSM_OUT,
    LLM_TENSOR_TIME_MIX_W1,
    LLM_TENSOR_TIME_MIX_W2,
    LLM_TENSOR_TIME_MIX_LERP_X,
    LLM_TENSOR_TIME_MIX_LERP_W,
    LLM_TENSOR_TIME_MIX_LERP_K,
    LLM_TENSOR_TIME_MIX_LERP_V,
    LLM_TENSOR_TIME_MIX_LERP_R,
    LLM_TENSOR_TIME_MIX_LERP_G,
    LLM_TENSOR_TIME_MIX_FIRST,
    LLM_TENSOR_TIME_MIX_DECAY,
    LLM_TENSOR_TIME_MIX_DECAY_W1,
    LLM_TENSOR_TIME_MIX_DECAY_W2,
    LLM_TENSOR_TIME_MIX_KEY,
    LLM_TENSOR_TIME_MIX_VALUE,
    LLM_TENSOR_TIME_MIX_RECEPTANCE,
    LLM_TENSOR_TIME_MIX_GATE,
    LLM_TENSOR_TIME_MIX_LN,
    LLM_TENSOR_TIME_MIX_OUTPUT,
    LLM_TENSOR_CHANNEL_MIX_LERP_K,
    LLM_TENSOR_CHANNEL_MIX_LERP_R,
    LLM_TENSOR_CHANNEL_MIX_KEY,
    LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
    LLM_TENSOR_CHANNEL_MIX_VALUE,
    LLM_TENSOR_ATTN_Q_A,
    LLM_TENSOR_ATTN_Q_B,
    LLM_TENSOR_ATTN_KV_A_MQA,
    LLM_TENSOR_ATTN_KV_B,
    LLM_TENSOR_ATTN_Q_A_NORM,
    LLM_TENSOR_ATTN_KV_A_NORM,
    LLM_TENSOR_ATTN_SUB_NORM,
    LLM_TENSOR_FFN_SUB_NORM,
    LLM_TENSOR_DEC_ATTN_NORM,
    LLM_TENSOR_DEC_ATTN_Q,
    LLM_TENSOR_DEC_ATTN_K,
    LLM_TENSOR_DEC_ATTN_V,
    LLM_TENSOR_DEC_ATTN_OUT,
    LLM_TENSOR_DEC_ATTN_REL_B,
    LLM_TENSOR_DEC_CROSS_ATTN_NORM,
    LLM_TENSOR_DEC_CROSS_ATTN_Q,
    LLM_TENSOR_DEC_CROSS_ATTN_K,
    LLM_TENSOR_DEC_CROSS_ATTN_V,
    LLM_TENSOR_DEC_CROSS_ATTN_OUT,
    LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
    LLM_TENSOR_DEC_FFN_NORM,
    LLM_TENSOR_DEC_FFN_GATE,
    LLM_TENSOR_DEC_FFN_DOWN,
    LLM_TENSOR_DEC_FFN_UP,
    LLM_TENSOR_DEC_OUTPUT_NORM,
    LLM_TENSOR_ENC_ATTN_NORM,
    LLM_TENSOR_ENC_ATTN_Q,
    LLM_TENSOR_ENC_ATTN_K,
    LLM_TENSOR_ENC_ATTN_V,
    LLM_TENSOR_ENC_ATTN_OUT,
    LLM_TENSOR_ENC_ATTN_REL_B,
    LLM_TENSOR_ENC_FFN_NORM,
    LLM_TENSOR_ENC_FFN_GATE,
    LLM_TENSOR_ENC_FFN_DOWN,
    LLM_TENSOR_ENC_FFN_UP,
    LLM_TENSOR_ENC_OUTPUT_NORM,
    LLM_TENSOR_CLS,
    LLM_TENSOR_CLS_OUT,
    LLM_TENSOR_BSKCN_TV,
    LLM_TENSOR_CROSS_ATTN_K_NORM,
    LLM_TENSOR_CROSS_ATTN_K_PROJ,
    LLM_TENSOR_CROSS_ATTN_O_PROJ,
    LLM_TENSOR_CROSS_ATTN_Q_NORM,
    LLM_TENSOR_CROSS_ATTN_Q_PROJ,
    LLM_TENSOR_CROSS_ATTN_V_PROJ,
    LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
    LLM_TENSOR_CROSS_ATTN_MLP_GATE,
    LLM_TENSOR_CONV1D,
    LLM_TENSOR_CONVNEXT_DW,
    LLM_TENSOR_CONVNEXT_NORM,
    LLM_TENSOR_CONVNEXT_PW1,
    LLM_TENSOR_CONVNEXT_PW2,
    LLM_TENSOR_CONVNEXT_GAMMA,
    LLM_TENSOR_POS_NET_CONV1,
    LLM_TENSOR_POS_NET_CONV2,
    LLM_TENSOR_POS_NET_NORM,
    LLM_TENSOR_POS_NET_NORM1,
    LLM_TENSOR_POS_NET_NORM2,
    LLM_TENSOR_POS_NET_ATTN_NORM,
    LLM_TENSOR_POS_NET_ATTN_Q,
    LLM_TENSOR_POS_NET_ATTN_K,
    LLM_TENSOR_POS_NET_ATTN_V,
    LLM_TENSOR_POS_NET_ATTN_OUT,
};
enum llm_tensor_layer {
    LLM_TENSOR_LAYER_INPUT,     // e.g. token embeddings
    LLM_TENSOR_LAYER_REPEATING, // per-block tensors (attention, ffn, ...)
    LLM_TENSOR_LAYER_OUTPUT,    // e.g. output head and final norm
};
struct LLM_KV {
    LLM_KV(llm_arch arch);

    llm_arch arch;

    std::string operator()(llm_kv kv) const;
};
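
// usage sketch for LLM_KV, assuming the usual gguf key naming convention
// ("<arch>.<key>", per gguf.py):
//
//   const auto kv = LLM_KV(LLM_ARCH_LLAMA);
//
//   std::string key = kv(LLM_KV_CONTEXT_LENGTH); // expected: "llama.context_length"
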
// helper to handle gguf constants
// usage:
//
//   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
//
//   std::string name = tn(LLM_TENSOR_OUTPUT);                 -> "output"
//   std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");     -> "token_embd.bias"
//   std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
//
struct LLM_TN_IMPL {
    const llm_arch     arch;
    const llm_tensor   tensor;
    const char * const suffix; // e.g. "weight" / "bias", or nullptr for no suffix
    const int          bid;    // block (layer) index, -1 if not applicable
    const int          xid;    // secondary index used by some tensor name formats, -1 if not applicable

    std::string str() const;

    operator std::string() const {
        return str();
    }

    friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
        return str == tn.str();
    }

    friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
        return str != tn.str();
    }
};
struct LLM_TN {
    LLM_TN(llm_arch arch) : arch(arch) {}

    llm_arch arch;

    LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
        return { arch, tensor, suffix, bid, xid };
    }

    LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
        return { arch, tensor, nullptr, bid, xid };
    }
};
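
// usage sketch: the implicit std::string conversion and the friend comparison
// operators above allow a tensor name read from a gguf file to be matched
// directly against a constructed name (cur_name and il are hypothetical
// caller-side variables here):
//
//   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
//
//   if (cur_name == tn(LLM_TENSOR_ATTN_Q, "weight", il)) {
//       // found the attention Q weight of block il
//   }
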
struct llm_tensor_info {
    llm_tensor_layer layer;
    ggml_op op;
};
const char * llm_arch_name(llm_arch arch);

llm_arch llm_arch_from_string(const std::string & name);

const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
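
// usage sketch, assuming "llama" is the registered string name for
// LLM_ARCH_LLAMA (unknown names are expected to map to LLM_ARCH_UNKNOWN):
//
//   llm_arch arch = llm_arch_from_string("llama"); // -> LLM_ARCH_LLAMA
//   const char * name = llm_arch_name(arch);       // -> "llama"
//
//   // per-tensor metadata: which layer group the tensor belongs to and which
//   // ggml op is expected to consume it
//   const llm_tensor_info & info = llm_tensor_info_for(LLM_TENSOR_ATTN_Q);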