llama-arch.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. #pragma once
  2. #include "ggml.h" // ggml_op
  3. #include <string>
//
// gguf constants (sync with gguf.py)
//
  7. enum llm_arch {
  8. LLM_ARCH_LLAMA,
  9. LLM_ARCH_MLLAMA,
  10. LLM_ARCH_DECI,
  11. LLM_ARCH_FALCON,
  12. LLM_ARCH_BAICHUAN,
  13. LLM_ARCH_GROK,
  14. LLM_ARCH_GPT2,
  15. LLM_ARCH_GPTJ,
  16. LLM_ARCH_GPTNEOX,
  17. LLM_ARCH_MPT,
  18. LLM_ARCH_STARCODER,
  19. LLM_ARCH_REFACT,
  20. LLM_ARCH_BERT,
  21. LLM_ARCH_NOMIC_BERT,
  22. LLM_ARCH_JINA_BERT_V2,
  23. LLM_ARCH_BLOOM,
  24. LLM_ARCH_STABLELM,
  25. LLM_ARCH_QWEN,
  26. LLM_ARCH_QWEN2,
  27. LLM_ARCH_QWEN2MOE,
  28. LLM_ARCH_QWEN2VL,
  29. LLM_ARCH_PHI2,
  30. LLM_ARCH_PHI3,
  31. LLM_ARCH_PHIMOE,
  32. LLM_ARCH_PLAMO,
  33. LLM_ARCH_CODESHELL,
  34. LLM_ARCH_ORION,
  35. LLM_ARCH_INTERNLM2,
  36. LLM_ARCH_MINICPM,
  37. LLM_ARCH_MINICPM3,
  38. LLM_ARCH_GEMMA,
  39. LLM_ARCH_GEMMA2,
  40. LLM_ARCH_STARCODER2,
  41. LLM_ARCH_MAMBA,
  42. LLM_ARCH_XVERSE,
  43. LLM_ARCH_COMMAND_R,
  44. LLM_ARCH_COHERE2,
  45. LLM_ARCH_DBRX,
  46. LLM_ARCH_OLMO,
  47. LLM_ARCH_OLMO2,
  48. LLM_ARCH_OLMOE,
  49. LLM_ARCH_OPENELM,
  50. LLM_ARCH_ARCTIC,
  51. LLM_ARCH_DEEPSEEK,
  52. LLM_ARCH_DEEPSEEK2,
  53. LLM_ARCH_CHATGLM,
  54. LLM_ARCH_BITNET,
  55. LLM_ARCH_T5,
  56. LLM_ARCH_T5ENCODER,
  57. LLM_ARCH_JAIS,
  58. LLM_ARCH_NEMOTRON,
  59. LLM_ARCH_EXAONE,
  60. LLM_ARCH_RWKV6,
  61. LLM_ARCH_RWKV6QWEN2,
  62. LLM_ARCH_GRANITE,
  63. LLM_ARCH_GRANITE_MOE,
  64. LLM_ARCH_CHAMELEON,
  65. LLM_ARCH_SOLAR,
  66. LLM_ARCH_WAVTOKENIZER_DEC,
  67. LLM_ARCH_UNKNOWN,
  68. };
  69. enum llm_kv {
  70. LLM_KV_GENERAL_TYPE,
  71. LLM_KV_GENERAL_ARCHITECTURE,
  72. LLM_KV_GENERAL_QUANTIZATION_VERSION,
  73. LLM_KV_GENERAL_ALIGNMENT,
  74. LLM_KV_GENERAL_NAME,
  75. LLM_KV_GENERAL_AUTHOR,
  76. LLM_KV_GENERAL_VERSION,
  77. LLM_KV_GENERAL_URL,
  78. LLM_KV_GENERAL_DESCRIPTION,
  79. LLM_KV_GENERAL_LICENSE,
  80. LLM_KV_GENERAL_SOURCE_URL,
  81. LLM_KV_GENERAL_SOURCE_HF_REPO,
  82. LLM_KV_VOCAB_SIZE,
  83. LLM_KV_CONTEXT_LENGTH,
  84. LLM_KV_EMBEDDING_LENGTH,
  85. LLM_KV_FEATURES_LENGTH,
  86. LLM_KV_BLOCK_COUNT,
  87. LLM_KV_LEADING_DENSE_BLOCK_COUNT,
  88. LLM_KV_FEED_FORWARD_LENGTH,
  89. LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
  90. LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
  91. LLM_KV_USE_PARALLEL_RESIDUAL,
  92. LLM_KV_TENSOR_DATA_LAYOUT,
  93. LLM_KV_EXPERT_COUNT,
  94. LLM_KV_EXPERT_USED_COUNT,
  95. LLM_KV_EXPERT_SHARED_COUNT,
  96. LLM_KV_EXPERT_WEIGHTS_SCALE,
  97. LLM_KV_EXPERT_WEIGHTS_NORM,
  98. LLM_KV_EXPERT_GATING_FUNC,
  99. LLM_KV_POOLING_TYPE,
  100. LLM_KV_LOGIT_SCALE,
  101. LLM_KV_DECODER_START_TOKEN_ID,
  102. LLM_KV_ATTN_LOGIT_SOFTCAPPING,
  103. LLM_KV_FINAL_LOGIT_SOFTCAPPING,
  104. LLM_KV_SWIN_NORM,
  105. LLM_KV_RESCALE_EVERY_N_LAYERS,
  106. LLM_KV_TIME_MIX_EXTRA_DIM,
  107. LLM_KV_TIME_DECAY_EXTRA_DIM,
  108. LLM_KV_RESIDUAL_SCALE,
  109. LLM_KV_EMBEDDING_SCALE,
  110. LLM_KV_TOKEN_SHIFT_COUNT,
  111. LLM_KV_ATTENTION_HEAD_COUNT,
  112. LLM_KV_ATTENTION_HEAD_COUNT_KV,
  113. LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
  114. LLM_KV_ATTENTION_CLAMP_KQV,
  115. LLM_KV_ATTENTION_KEY_LENGTH,
  116. LLM_KV_ATTENTION_VALUE_LENGTH,
  117. LLM_KV_ATTENTION_LAYERNORM_EPS,
  118. LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
  119. LLM_KV_ATTENTION_GROUPNORM_EPS,
  120. LLM_KV_ATTENTION_GROUPNORM_GROUPS,
  121. LLM_KV_ATTENTION_CAUSAL,
  122. LLM_KV_ATTENTION_Q_LORA_RANK,
  123. LLM_KV_ATTENTION_KV_LORA_RANK,
  124. LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
  125. LLM_KV_ATTENTION_SLIDING_WINDOW,
  126. LLM_KV_ATTENTION_SCALE,
  127. LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
  128. LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS,
  129. LLM_KV_ROPE_DIMENSION_COUNT,
  130. LLM_KV_ROPE_DIMENSION_SECTIONS,
  131. LLM_KV_ROPE_FREQ_BASE,
  132. LLM_KV_ROPE_SCALE_LINEAR,
  133. LLM_KV_ROPE_SCALING_TYPE,
  134. LLM_KV_ROPE_SCALING_FACTOR,
  135. LLM_KV_ROPE_SCALING_ATTN_FACTOR,
  136. LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
  137. LLM_KV_ROPE_SCALING_FINETUNED,
  138. LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
  139. LLM_KV_SPLIT_NO,
  140. LLM_KV_SPLIT_COUNT,
  141. LLM_KV_SPLIT_TENSORS_COUNT,
  142. LLM_KV_SSM_INNER_SIZE,
  143. LLM_KV_SSM_CONV_KERNEL,
  144. LLM_KV_SSM_STATE_SIZE,
  145. LLM_KV_SSM_TIME_STEP_RANK,
  146. LLM_KV_SSM_DT_B_C_RMS,
  147. LLM_KV_WKV_HEAD_SIZE,
  148. LLM_KV_TOKENIZER_MODEL,
  149. LLM_KV_TOKENIZER_PRE,
  150. LLM_KV_TOKENIZER_LIST,
  151. LLM_KV_TOKENIZER_TOKEN_TYPE,
  152. LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
  153. LLM_KV_TOKENIZER_SCORES,
  154. LLM_KV_TOKENIZER_MERGES,
  155. LLM_KV_TOKENIZER_BOS_ID,
  156. LLM_KV_TOKENIZER_EOS_ID,
  157. LLM_KV_TOKENIZER_EOT_ID,
  158. LLM_KV_TOKENIZER_EOM_ID,
  159. LLM_KV_TOKENIZER_UNK_ID,
  160. LLM_KV_TOKENIZER_SEP_ID,
  161. LLM_KV_TOKENIZER_PAD_ID,
  162. LLM_KV_TOKENIZER_CLS_ID,
  163. LLM_KV_TOKENIZER_MASK_ID,
  164. LLM_KV_TOKENIZER_ADD_BOS,
  165. LLM_KV_TOKENIZER_ADD_EOS,
  166. LLM_KV_TOKENIZER_ADD_PREFIX,
  167. LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
  168. LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
  169. LLM_KV_TOKENIZER_HF_JSON,
  170. LLM_KV_TOKENIZER_RWKV,
  171. LLM_KV_TOKENIZER_CHAT_TEMPLATE,
  172. LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
  173. LLM_KV_TOKENIZER_FIM_PRE_ID,
  174. LLM_KV_TOKENIZER_FIM_SUF_ID,
  175. LLM_KV_TOKENIZER_FIM_MID_ID,
  176. LLM_KV_TOKENIZER_FIM_PAD_ID,
  177. LLM_KV_TOKENIZER_FIM_REP_ID,
  178. LLM_KV_TOKENIZER_FIM_SEP_ID,
  179. LLM_KV_ADAPTER_TYPE,
  180. LLM_KV_ADAPTER_LORA_ALPHA,
  181. LLM_KV_POSNET_EMBEDDING_LENGTH,
  182. LLM_KV_POSNET_BLOCK_COUNT,
  183. LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
  184. LLM_KV_CONVNEXT_BLOCK_COUNT,
  185. // deprecated:
  186. LLM_KV_TOKENIZER_PREFIX_ID,
  187. LLM_KV_TOKENIZER_SUFFIX_ID,
  188. LLM_KV_TOKENIZER_MIDDLE_ID,
  189. };
  190. enum llm_tensor {
  191. LLM_TENSOR_TOKEN_EMBD,
  192. LLM_TENSOR_TOKEN_EMBD_NORM,
  193. LLM_TENSOR_TOKEN_TYPES,
  194. LLM_TENSOR_POS_EMBD,
  195. LLM_TENSOR_OUTPUT,
  196. LLM_TENSOR_OUTPUT_NORM,
  197. LLM_TENSOR_ROPE_FREQS,
  198. LLM_TENSOR_ROPE_FACTORS_LONG,
  199. LLM_TENSOR_ROPE_FACTORS_SHORT,
  200. LLM_TENSOR_ATTN_Q,
  201. LLM_TENSOR_ATTN_K,
  202. LLM_TENSOR_ATTN_V,
  203. LLM_TENSOR_ATTN_QKV,
  204. LLM_TENSOR_ATTN_OUT,
  205. LLM_TENSOR_ATTN_NORM,
  206. LLM_TENSOR_ATTN_NORM_2,
  207. LLM_TENSOR_ATTN_OUT_NORM,
  208. LLM_TENSOR_ATTN_POST_NORM,
  209. LLM_TENSOR_ATTN_ROT_EMBD,
  210. LLM_TENSOR_FFN_GATE_INP,
  211. LLM_TENSOR_FFN_GATE_INP_SHEXP,
  212. LLM_TENSOR_FFN_NORM,
  213. LLM_TENSOR_FFN_POST_NORM,
  214. LLM_TENSOR_FFN_GATE,
  215. LLM_TENSOR_FFN_DOWN,
  216. LLM_TENSOR_FFN_UP,
  217. LLM_TENSOR_FFN_ACT,
  218. LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
  219. LLM_TENSOR_FFN_GATE_EXP,
  220. LLM_TENSOR_FFN_UP_EXP,
  221. LLM_TENSOR_FFN_NORM_EXPS,
  222. LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
  223. LLM_TENSOR_FFN_GATE_EXPS,
  224. LLM_TENSOR_FFN_UP_EXPS,
  225. LLM_TENSOR_FFN_DOWN_SHEXP,
  226. LLM_TENSOR_FFN_GATE_SHEXP,
  227. LLM_TENSOR_FFN_UP_SHEXP,
  228. LLM_TENSOR_FFN_EXP_PROBS_B,
  229. LLM_TENSOR_ATTN_Q_NORM,
  230. LLM_TENSOR_ATTN_K_NORM,
  231. LLM_TENSOR_LAYER_OUT_NORM,
  232. LLM_TENSOR_SSM_IN,
  233. LLM_TENSOR_SSM_CONV1D,
  234. LLM_TENSOR_SSM_X,
  235. LLM_TENSOR_SSM_DT,
  236. LLM_TENSOR_SSM_A,
  237. LLM_TENSOR_SSM_D,
  238. LLM_TENSOR_SSM_OUT,
  239. LLM_TENSOR_TIME_MIX_W1,
  240. LLM_TENSOR_TIME_MIX_W2,
  241. LLM_TENSOR_TIME_MIX_LERP_X,
  242. LLM_TENSOR_TIME_MIX_LERP_W,
  243. LLM_TENSOR_TIME_MIX_LERP_K,
  244. LLM_TENSOR_TIME_MIX_LERP_V,
  245. LLM_TENSOR_TIME_MIX_LERP_R,
  246. LLM_TENSOR_TIME_MIX_LERP_G,
  247. LLM_TENSOR_TIME_MIX_LERP_FUSED,
  248. LLM_TENSOR_TIME_MIX_FIRST,
  249. LLM_TENSOR_TIME_MIX_DECAY,
  250. LLM_TENSOR_TIME_MIX_DECAY_W1,
  251. LLM_TENSOR_TIME_MIX_DECAY_W2,
  252. LLM_TENSOR_TIME_MIX_KEY,
  253. LLM_TENSOR_TIME_MIX_VALUE,
  254. LLM_TENSOR_TIME_MIX_RECEPTANCE,
  255. LLM_TENSOR_TIME_MIX_GATE,
  256. LLM_TENSOR_TIME_MIX_LN,
  257. LLM_TENSOR_TIME_MIX_OUTPUT,
  258. LLM_TENSOR_CHANNEL_MIX_LERP_K,
  259. LLM_TENSOR_CHANNEL_MIX_LERP_R,
  260. LLM_TENSOR_CHANNEL_MIX_KEY,
  261. LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
  262. LLM_TENSOR_CHANNEL_MIX_VALUE,
  263. LLM_TENSOR_ATTN_Q_A,
  264. LLM_TENSOR_ATTN_Q_B,
  265. LLM_TENSOR_ATTN_KV_A_MQA,
  266. LLM_TENSOR_ATTN_KV_B,
  267. LLM_TENSOR_ATTN_Q_A_NORM,
  268. LLM_TENSOR_ATTN_KV_A_NORM,
  269. LLM_TENSOR_ATTN_SUB_NORM,
  270. LLM_TENSOR_FFN_SUB_NORM,
  271. LLM_TENSOR_DEC_ATTN_NORM,
  272. LLM_TENSOR_DEC_ATTN_Q,
  273. LLM_TENSOR_DEC_ATTN_K,
  274. LLM_TENSOR_DEC_ATTN_V,
  275. LLM_TENSOR_DEC_ATTN_OUT,
  276. LLM_TENSOR_DEC_ATTN_REL_B,
  277. LLM_TENSOR_DEC_CROSS_ATTN_NORM,
  278. LLM_TENSOR_DEC_CROSS_ATTN_Q,
  279. LLM_TENSOR_DEC_CROSS_ATTN_K,
  280. LLM_TENSOR_DEC_CROSS_ATTN_V,
  281. LLM_TENSOR_DEC_CROSS_ATTN_OUT,
  282. LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
  283. LLM_TENSOR_DEC_FFN_NORM,
  284. LLM_TENSOR_DEC_FFN_GATE,
  285. LLM_TENSOR_DEC_FFN_DOWN,
  286. LLM_TENSOR_DEC_FFN_UP,
  287. LLM_TENSOR_DEC_OUTPUT_NORM,
  288. LLM_TENSOR_ENC_ATTN_NORM,
  289. LLM_TENSOR_ENC_ATTN_Q,
  290. LLM_TENSOR_ENC_ATTN_K,
  291. LLM_TENSOR_ENC_ATTN_V,
  292. LLM_TENSOR_ENC_ATTN_OUT,
  293. LLM_TENSOR_ENC_ATTN_REL_B,
  294. LLM_TENSOR_ENC_FFN_NORM,
  295. LLM_TENSOR_ENC_FFN_GATE,
  296. LLM_TENSOR_ENC_FFN_DOWN,
  297. LLM_TENSOR_ENC_FFN_UP,
  298. LLM_TENSOR_ENC_OUTPUT_NORM,
  299. LLM_TENSOR_CLS,
  300. LLM_TENSOR_CLS_OUT,
  301. LLM_TENSOR_BSKCN_TV,
  302. LLM_TENSOR_CROSS_ATTN_K_NORM,
  303. LLM_TENSOR_CROSS_ATTN_K_PROJ,
  304. LLM_TENSOR_CROSS_ATTN_O_PROJ,
  305. LLM_TENSOR_CROSS_ATTN_Q_NORM,
  306. LLM_TENSOR_CROSS_ATTN_Q_PROJ,
  307. LLM_TENSOR_CROSS_ATTN_V_PROJ,
  308. LLM_TENSOR_CROSS_ATTN_ATTN_GATE,
  309. LLM_TENSOR_CROSS_ATTN_MLP_GATE,
  310. LLM_TENSOR_CONV1D,
  311. LLM_TENSOR_CONVNEXT_DW,
  312. LLM_TENSOR_CONVNEXT_NORM,
  313. LLM_TENSOR_CONVNEXT_PW1,
  314. LLM_TENSOR_CONVNEXT_PW2,
  315. LLM_TENSOR_CONVNEXT_GAMMA,
  316. LLM_TENSOR_POS_NET_CONV1,
  317. LLM_TENSOR_POS_NET_CONV2,
  318. LLM_TENSOR_POS_NET_NORM,
  319. LLM_TENSOR_POS_NET_NORM1,
  320. LLM_TENSOR_POS_NET_NORM2,
  321. LLM_TENSOR_POS_NET_ATTN_NORM,
  322. LLM_TENSOR_POS_NET_ATTN_Q,
  323. LLM_TENSOR_POS_NET_ATTN_K,
  324. LLM_TENSOR_POS_NET_ATTN_V,
  325. LLM_TENSOR_POS_NET_ATTN_OUT,
  326. };
  327. enum llm_tensor_layer {
  328. LLM_TENSOR_LAYER_INPUT,
  329. LLM_TENSOR_LAYER_REPEATING,
  330. LLM_TENSOR_LAYER_OUTPUT,
  331. };
  332. struct LLM_KV {
  333. LLM_KV(llm_arch arch, const char * suffix = nullptr);
  334. llm_arch arch;
  335. const char * suffix;
  336. std::string operator()(llm_kv kv) const;
  337. };
// helper to handle gguf constants
// usage:
//
//   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
//
//   std::string name = tn(LLM_TENSOR_OUTPUT);                     -> "output"
//   std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");         -> "token_embd.bias"
//   std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3);     -> "blk.3.attn_norm.weight"
//
  347. struct LLM_TN_IMPL {
  348. const llm_arch arch;
  349. const llm_tensor tensor;
  350. const char * const suffix;
  351. const int bid;
  352. const int xid;
  353. std::string str() const;
  354. operator std::string() const {
  355. return str();
  356. }
  357. friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
  358. return str == tn.str();
  359. }
  360. friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
  361. return str != tn.str();
  362. }
  363. };
  364. struct LLM_TN {
  365. LLM_TN(llm_arch arch) : arch(arch) {}
  366. llm_arch arch;
  367. LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
  368. return { arch, tensor, suffix, bid, xid };
  369. }
  370. LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
  371. return { arch, tensor, nullptr, bid, xid };
  372. }
  373. };
  374. struct llm_tensor_info {
  375. llm_tensor_layer layer;
  376. ggml_op op;
  377. };
  378. const char * llm_arch_name(llm_arch arch);
  379. llm_arch llm_arch_from_string(const std::string & name);
  380. const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);