llama-model.cpp

/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "llama-model.h"

#include "llama-impl.h"
#include "llama-model-loader.h"

#include "unicode.h" // TODO: remove

#include <algorithm>
#include <cassert>
#include <functional>
#include <sstream>
#include <stdexcept>

static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;

const char * llm_type_name(llm_type type) {
    switch (type) {
        case MODEL_14M: return "14M";
        case MODEL_17M: return "17M";
        case MODEL_22M: return "22M";
        case MODEL_33M: return "33M";
        case MODEL_60M: return "60M";
        case MODEL_70M: return "70M";
        case MODEL_80M: return "80M";
        case MODEL_109M: return "109M";
        case MODEL_137M: return "137M";
        case MODEL_160M: return "160M";
        case MODEL_220M: return "220M";
        case MODEL_250M: return "250M";
        case MODEL_270M: return "270M";
        case MODEL_335M: return "335M";
        case MODEL_410M: return "410M";
        case MODEL_450M: return "450M";
        case MODEL_770M: return "770M";
        case MODEL_780M: return "780M";
        case MODEL_0_5B: return "0.5B";
        case MODEL_1B: return "1B";
        case MODEL_1_3B: return "1.3B";
        case MODEL_1_4B: return "1.4B";
        case MODEL_1_5B: return "1.5B";
        case MODEL_1_6B: return "1.6B";
        case MODEL_2B: return "2B";
        case MODEL_2_8B: return "2.8B";
        case MODEL_3B: return "3B";
        case MODEL_4B: return "4B";
        case MODEL_6B: return "6B";
        case MODEL_6_9B: return "6.9B";
        case MODEL_7B: return "7B";
        case MODEL_8B: return "8B";
        case MODEL_9B: return "9B";
        case MODEL_11B: return "11B";
        case MODEL_12B: return "12B";
        case MODEL_13B: return "13B";
        case MODEL_14B: return "14B";
        case MODEL_15B: return "15B";
        case MODEL_16B: return "16B";
        case MODEL_20B: return "20B";
        case MODEL_30B: return "30B";
        case MODEL_32B: return "32B";
        case MODEL_34B: return "34B";
        case MODEL_35B: return "35B";
        case MODEL_40B: return "40B";
        case MODEL_65B: return "65B";
        case MODEL_70B: return "70B";
        case MODEL_236B: return "236B";
        case MODEL_314B: return "314B";
        case MODEL_671B: return "671B";
        case MODEL_SMALL: return "0.1B";
        case MODEL_MEDIUM: return "0.4B";
        case MODEL_LARGE: return "0.8B";
        case MODEL_XL: return "1.5B";
        case MODEL_A1_7B: return "A1.7B";
        case MODEL_A2_7B: return "A2.7B";
        case MODEL_8x7B: return "8x7B";
        case MODEL_8x22B: return "8x22B";
        case MODEL_16x12B: return "16x12B";
        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
        case MODEL_57B_A14B: return "57B.A14B";
        case MODEL_27B: return "27B";
        default: return "?B";
    }
}

static std::string llama_model_ftype_name(llama_ftype ftype) {
    if (ftype & LLAMA_FTYPE_GUESSED) {
        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
    }

    // note: "bpw" in the descriptions below = bits per weight
    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32: return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";

        default: return "unknown, may not work";
    }
}

static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default: return "unknown";
    }
}

std::string llama_model_arch_name (const llama_model & model) {
    return llm_arch_name(model.arch);
}

std::string llama_model_type_name (const llama_model & model) {
    return llm_type_name(model.type);
}

std::string llama_model_ftype_name(const llama_model & model) {
    return llama_model_ftype_name(model.ftype);
}

ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
    return select_buft(
            *model.dev_layer.at(il).buft_list,
            [&](ggml_context * ctx) {
                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
                return ggml_add(ctx, cur, layer_dir);
            });
}
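
// Note: llama_model_select_buft() picks the backend buffer type for layer `il`
// by handing select_buft() a representative op to probe with: an F32 add of two
// n_embd-sized vectors. A buffer type that can host this op is assumed to be
// suitable for the layer's tensors in general. A minimal caller sketch
// (hypothetical, assuming a fully loaded `model`):
//
//     ggml_backend_buffer_type_t buft = llama_model_select_buft(model, /*il =*/ 0);
//     LLAMA_LOG_INFO("layer 0 buffer type: %s\n", ggml_backend_buft_name(buft));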

struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name) {
    auto it = std::find_if(model.tensors_by_name.begin(), model.tensors_by_name.end(),
            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
                return it.first == name;
            });
    if (it == model.tensors_by_name.end()) {
        return nullptr;
    }

    return it->second;
}

size_t llama_model_max_nodes(const llama_model & model) {
    return std::max<size_t>(8192, model.tensors_by_name.size()*5);
}
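
// Note: the 8192 floor and the 5x multiplier above are sizing heuristics for
// the ggml compute graph: small models still get a generous node budget, while
// models with many tensors scale the budget with tensor count so that graph
// construction does not run out of nodes.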

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE,     "none"     },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,   "linear"   },
    { LLAMA_ROPE_SCALING_TYPE_YARN,     "yarn"     },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }

    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}

// NOTE: avoid ever using this except for building the token_to_piece caches
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
    std::string piece;
    piece.resize(piece.capacity()); // using string internal cache
    const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
    if (n_chars < 0) {
        piece.resize(-n_chars);
        int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
        GGML_ASSERT(check == -n_chars);
    }
    else {
        piece.resize(n_chars);
    }

    return piece;
}
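
// Note on the two-pass pattern above: the C API llama_token_to_piece() returns
// the number of bytes written on success, or the negated required buffer size
// when the supplied buffer is too small. The first call uses the string's
// existing capacity as an optimistic buffer; on a negative return, the string
// is resized to exactly -n_chars and the call is retried, which must then
// succeed (enforced by the GGML_ASSERT).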

void llm_load_stats(llama_model_loader & ml, llama_model & model) {
    model.n_elements = ml.n_elements;
    model.n_bytes = ml.n_bytes;
}

void llm_load_arch(llama_model_loader & ml, llama_model & model) {
    model.arch = ml.get_arch();
    if (model.arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}
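
// The loaders below are used in sequence during model load: llm_load_arch()
// first (the architecture selects the GGUF key namespace), then
// llm_load_hparams(), then llm_load_vocab(). The actual call site lives
// elsewhere in the library; the ordering stated here is inferred from the
// data dependencies (hparams reads model.arch, and the vocab reader builds
// its keys via LLM_KV(model.arch)).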

void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
    auto & hparams = model.hparams;

    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        enum gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        model.gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);

    // get hparams kv
    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);

    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH,    hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH,  hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT,       hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT,      hparams.n_expert,      false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);

    if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);

        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);

        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
    }

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
    }

    // zero-out the array hparams
    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);
    std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
    ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;

    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
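    // Worked example of the inversion above (illustrative values, not taken
    // from any particular model file): a GGUF that stores a rope scaling
    // factor of 4.0 (i.e. "stretch the context 4x") yields
    // rope_freq_scale_train = 0.25, while a missing or zero factor leaves the
    // frequency scale at the neutral 1.0.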
    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

    // non-transformer models do not have attention heads
    if (hparams.n_head() > 0) {
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
        // gpt-j n_rot = rotary_dim
        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);

        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);

        // sanity check for n_rot (optional)
        hparams.n_rot = hparams.n_embd_head_k;
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_MLLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
        }
    } else {
        hparams.n_rot = 0;
        hparams.n_embd_head_k = 0;
        hparams.n_embd_head_v = 0;
    }

    using e_model = llm_type; // TMP

    // arch-specific KVs
    switch (model.arch) {
        case LLM_ARCH_LLAMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                if (hparams.n_expert == 8) {
                    switch (hparams.n_layer) {
                        case 32: model.type = e_model::MODEL_8x7B; break;
                        case 56: model.type = e_model::MODEL_8x22B; break;
                        default: model.type = e_model::MODEL_UNKNOWN;
                    }
                } else {
                    switch (hparams.n_layer) {
                        case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B
                        case 22: model.type = e_model::MODEL_1B; break;
                        case 26: model.type = e_model::MODEL_3B; break;
                        case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B
                        // granite uses a vocab with len 49152
                        case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
                        case 36: model.type = e_model::MODEL_8B; break; // granite
                        case 40: model.type = e_model::MODEL_13B; break;
                        case 48: model.type = e_model::MODEL_34B; break;
                        case 60: model.type = e_model::MODEL_30B; break;
                        case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
                        default: model.type = e_model::MODEL_UNKNOWN;
                    }
                }
            } break;
        case LLM_ARCH_MLLAMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_11B; break;
                    case 100: model.type = e_model::MODEL_90B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 80: model.type = e_model::MODEL_70B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MINICPM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);

                switch (hparams.n_layer) {
                    case 52: model.type = e_model::MODEL_1B; break;
                    case 40: model.type = e_model::MODEL_2B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MINICPM3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);

                switch (hparams.n_layer) {
                    case 62: model.type = e_model::MODEL_4B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GROK:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 64: model.type = e_model::MODEL_314B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_FALCON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 60: model.type = e_model::MODEL_40B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }

                if (model.type == e_model::MODEL_13B) {
                    // TODO: become GGUF KV parameter
                    hparams.f_max_alibi_bias = 8.0f;
                }
            } break;
        case LLM_ARCH_STARCODER:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 36: model.type = e_model::MODEL_3B; break;
                    case 42: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_15B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_REFACT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_1B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }

                // TODO: become GGUF KV parameter
                hparams.f_max_alibi_bias = 8.0f;
            } break;
        case LLM_ARCH_BERT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                switch (hparams.n_layer) {
                    case 3:
                        model.type = e_model::MODEL_17M; break; // bge-micro
                    case 6:
                        model.type = e_model::MODEL_22M; break; // MiniLM-L6
                    case 12:
                        switch (hparams.n_embd) {
                            case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
                            case 768: model.type = e_model::MODEL_109M; break; // bge-base
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 24:
                        model.type = e_model::MODEL_335M; break; // bge-large
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_JINA_BERT_V2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                hparams.f_max_alibi_bias = 8.0f;

                switch (hparams.n_layer) {
                    case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
                    case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_NOMIC_BERT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);

                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                    model.type = e_model::MODEL_137M;
                }
            } break;
        case LLM_ARCH_BLOOM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 30:
                        switch (hparams.n_embd) {
                            case 2560: model.type = e_model::MODEL_3B; break;
                            case 4096: model.type = e_model::MODEL_7B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }

                // TODO: become GGUF KV parameter
                hparams.f_max_alibi_bias = 8.0f;
            } break;
        case LLM_ARCH_MPT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 48: model.type = e_model::MODEL_30B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_STABLELM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    case 40: model.type = e_model::MODEL_12B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_QWEN:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_QWEN2VL:
            {
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
            }
            // fall through
        case LLM_ARCH_QWEN2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
                    case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 36: model.type = e_model::MODEL_3B; break;
                    case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
                    case 48: model.type = e_model::MODEL_14B; break;
                    case 64: model.type = e_model::MODEL_32B; break;
                    case 80: model.type = e_model::MODEL_70B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_QWEN2MOE:
            {
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_A2_7B; break;
                    case 28: model.type = e_model::MODEL_57B_A14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PHI2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PHI3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    case 40: model.type = e_model::MODEL_14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }

                // for backward compatibility; see: https://github.com/ggerganov/llama.cpp/pull/8931
                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
                    hparams.n_swa = 2047;
                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
                    // default value for Phi-3-mini-128k-instruct
                    hparams.n_swa = 262144;
                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
                    // default value for Phi-3-medium-128k-instruct
                    hparams.n_swa = 131072;
                }
                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                if (!found_swa && hparams.n_swa == 0) {
                    throw std::runtime_error("invalid value for sliding_window");
                }
            } break;
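        // Note on the Phi-3 sliding-window handling above: older Phi-3 GGUF
        // conversions predate the LLM_KV_ATTENTION_SLIDING_WINDOW key, so a
        // default n_swa is backfilled from the layer count and trained context
        // length before the key is read; loading fails only if neither source
        // yields a value. See the linked PR #8931 for the background.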
        case LLM_ARCH_PLAMO:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GPT2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 12: model.type = e_model::MODEL_SMALL; break;
                    case 24: model.type = e_model::MODEL_MEDIUM; break;
                    case 36: model.type = e_model::MODEL_LARGE; break;
                    case 48: model.type = e_model::MODEL_XL; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CODESHELL:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 42: model.type = e_model::MODEL_7B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_ORION:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_INTERNLM2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 48: model.type = e_model::MODEL_20B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 18: model.type = e_model::MODEL_2B; break;
                    case 28: model.type = e_model::MODEL_7B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA2:
            {
                hparams.n_swa = 4096; // default value of gemma 2
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
                hparams.attn_soft_cap = true;

                switch (hparams.n_layer) {
                    case 26: model.type = e_model::MODEL_2B; break;
                    case 42: model.type = e_model::MODEL_9B; break;
                    case 46: model.type = e_model::MODEL_27B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_STARCODER2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 30: model.type = e_model::MODEL_3B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_15B; break;
                    case 52: model.type = e_model::MODEL_20B; break; // granite
                    case 88: model.type = e_model::MODEL_34B; break; // granite
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MAMBA:
            {
                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
                ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 24:
                        switch (hparams.n_embd) {
                            case 768: model.type = e_model::MODEL_SMALL; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 48:
                        switch (hparams.n_embd) {
                            case 1024: model.type = e_model::MODEL_MEDIUM; break;
                            case 1536: model.type = e_model::MODEL_LARGE; break;
                            case 2048: model.type = e_model::MODEL_XL; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 64:
                        switch (hparams.n_embd) {
                            case 2560: model.type = e_model::MODEL_3B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_XVERSE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    case 80: model.type = e_model::MODEL_65B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_35B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_COHERE2:
            {
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_8B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DBRX:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);

                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_16x12B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMO:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);

                switch (hparams.n_layer) {
                    case 22: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 80: model.type = e_model::MODEL_70B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMO2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 16: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 16: model.type = e_model::MODEL_A1_7B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OPENELM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 16: model.type = e_model::MODEL_270M; break;
                    case 20: model.type = e_model::MODEL_450M; break;
                    case 28: model.type = e_model::MODEL_1B; break;
                    case 36: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
                switch (hparams.n_layer) {
                    case 6:
                        switch (hparams.n_ff()) {
                            case 512: model.type = e_model::MODEL_14M; break;
                            case 2048: model.type = e_model::MODEL_70M; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 12:
                        switch (hparams.n_ff()) {
                            case 3072: model.type = e_model::MODEL_160M; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 16:
                        switch (hparams.n_ff()) {
                            case 8192: model.type = e_model::MODEL_1B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_ff()) {
                            case 4096: model.type = e_model::MODEL_410M; break;
                            case 8192: model.type = e_model::MODEL_1_4B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 32:
                        switch (hparams.n_ff()) {
                            case 10240: model.type = e_model::MODEL_2_8B; break;
                            case 16384: model.type = e_model::MODEL_6_9B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 36:
                        switch (hparams.n_ff()) {
                            case 20480: model.type = e_model::MODEL_12B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 44:
                        switch (hparams.n_ff()) {
                            case 24576: model.type = e_model::MODEL_20B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_ARCTIC:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                if (hparams.n_expert == 128) {
                    switch (hparams.n_layer) {
                        case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
                        default: model.type = e_model::MODEL_UNKNOWN;
                    }
                } else {
                    model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);

                switch (hparams.n_layer) {
                    case 28: model.type = e_model::MODEL_20B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                bool is_lite = (hparams.n_layer == 27);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                if (!is_lite) {
                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                }
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
                    // that have no expert_gating_func model parameter set
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                }
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);

                switch (hparams.n_layer) {
                    case 27: model.type = e_model::MODEL_16B; break;
                    case 60: model.type = e_model::MODEL_236B; break;
                    case 61: model.type = e_model::MODEL_671B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CHATGLM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 28: model.type = e_model::MODEL_6B; break;
                    case 40: model.type = e_model::MODEL_9B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BITNET:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 26: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_T5:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);

                uint32_t dec_start_token_id;
                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
                    hparams.dec_start_token_id = dec_start_token_id;
                }

                switch (hparams.n_layer) {
                    case 6: model.type = e_model::MODEL_60M; break; // t5-small
                    case 8: model.type = e_model::MODEL_80M; break; // flan-t5-small
                    case 12:
                        switch (hparams.n_ff()) {
                            case 3072: model.type = e_model::MODEL_220M; break; // t5-base
                            case 2048: model.type = e_model::MODEL_250M; break; // flan-t5-base
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_ff()) {
                            case 4096: model.type = e_model::MODEL_770M; break; // t5-large
                            case 2816: model.type = e_model::MODEL_780M; break; // flan-t5-large
                            case 16384: model.type = e_model::MODEL_3B; break; // t5-3b
                            case 5120: model.type = e_model::MODEL_3B; break; // flan-t5-xl
                            case 65536: model.type = e_model::MODEL_11B; break; // t5-11b
                            case 10240: model.type = e_model::MODEL_11B; break; // flan-t5-xxl
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
                model.type = e_model::MODEL_UNKNOWN;
            } break;
        case LLM_ARCH_JAIS:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1_3B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    /* TODO: add variants */
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_4B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_EXAONE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_8B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_RWKV6:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1_6B; break;
                    case 32:
                        switch (hparams.n_embd) {
                            case 2560: model.type = e_model::MODEL_3B; break;
                            case 4096: model.type = e_model::MODEL_7B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 61: model.type = e_model::MODEL_14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_3B; break;
                    case 40: model.type = e_model::MODEL_3B; break;
                    // Add additional layer/vocab/etc checks here for other model sizes
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
                ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 48: model.type = e_model::MODEL_34B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_SOLAR:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
                    auto & bskcn = hparams.n_bskcn_arr[i];
                    bskcn.fill(0);
                    auto kv = LLM_KV(model.arch);
                    ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
                }

                switch (hparams.n_layer) {
                    case 64: model.type = e_model::MODEL_22B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
            } break;
        default: throw std::runtime_error("unsupported model architecture");
    }

    model.ftype = ml.ftype;

    if (hparams.f_max_alibi_bias > 0.0f) {
        hparams.use_alibi = true;
    }

    hparams.rope_type = llama_rope_type(&model);
}

void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
    auto & vocab = model.vocab;

    struct gguf_context * ctx = ml.meta.get();

    const auto kv = LLM_KV(model.arch);

    // determine vocab type
    {
        std::string tokenizer_model;
        std::string tokenizer_pre;

        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
        ml.get_key(LLM_KV_TOKENIZER_PRE,   tokenizer_pre, false);

        if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
            vocab.type = LLAMA_VOCAB_TYPE_NONE;

            // default special tokens
            vocab.special_bos_id  = LLAMA_TOKEN_NULL;
            vocab.special_eos_id  = LLAMA_TOKEN_NULL;
            vocab.special_unk_id  = LLAMA_TOKEN_NULL;
            vocab.special_sep_id  = LLAMA_TOKEN_NULL;
            vocab.special_pad_id  = LLAMA_TOKEN_NULL;
            vocab.special_cls_id  = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;
            vocab.linefeed_id     = LLAMA_TOKEN_NULL;

            // read vocab size from metadata
            if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
                vocab.n_vocab = 0;
                LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab);
            }
            return;
        }

        if (tokenizer_model == "llama") {
            vocab.type = LLAMA_VOCAB_TYPE_SPM;

            // default special tokens
            vocab.special_bos_id  = 1;
            vocab.special_eos_id  = 2;
            vocab.special_unk_id  = 0;
            vocab.special_sep_id  = LLAMA_TOKEN_NULL;
            vocab.special_pad_id  = LLAMA_TOKEN_NULL;
            vocab.special_cls_id  = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "bert") {
            vocab.type = LLAMA_VOCAB_TYPE_WPM;

            // default special tokens
            vocab.special_bos_id  = LLAMA_TOKEN_NULL;
            vocab.special_eos_id  = LLAMA_TOKEN_NULL;
            vocab.special_unk_id  = 100;
            vocab.special_sep_id  = 102;
            vocab.special_pad_id  = 0;
            vocab.special_cls_id  = 101;
            vocab.special_mask_id = 103;
        } else if (tokenizer_model == "gpt2") {
            vocab.type = LLAMA_VOCAB_TYPE_BPE;

            // read bpe merges and populate bpe ranks
            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
            if (merges_keyidx == -1) {
                throw std::runtime_error("cannot find tokenizer merges in model file\n");
            }

            const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
            for (int i = 0; i < n_merges; i++) {
                const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
                GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

                std::string first;
                std::string second;

                const size_t pos = word.find(' ', 1);

                if (pos != std::string::npos) {
                    first  = word.substr(0, pos);
                    second = word.substr(pos + 1);
                }

                vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
            }
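
            // Note on the merge parsing above: each merge entry is stored as
            // "left right", and the pair is split at the first space at index
            // >= 1, so a left-hand token that itself begins with a space
            // (common in byte-level BPE vocabularies) stays intact. The loop
            // index i doubles as the merge rank.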
  1096. // default special tokens
  1097. vocab.special_bos_id = 11;
  1098. vocab.special_eos_id = 11;
  1099. vocab.special_unk_id = LLAMA_TOKEN_NULL;
  1100. vocab.special_sep_id = LLAMA_TOKEN_NULL;
  1101. vocab.special_pad_id = LLAMA_TOKEN_NULL;
  1102. vocab.special_cls_id = LLAMA_TOKEN_NULL;
  1103. vocab.special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "t5") {
            vocab.type = LLAMA_VOCAB_TYPE_UGM;

            // default special tokens
            vocab.special_bos_id  = LLAMA_TOKEN_NULL;
            vocab.special_eos_id  = 1;
            vocab.special_unk_id  = 2;
            vocab.special_sep_id  = LLAMA_TOKEN_NULL;
            vocab.special_pad_id  = 0;
            vocab.special_cls_id  = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;

            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
            if (precompiled_charsmap_keyidx != -1) {
                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
                const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
                // correct endianness of data in precompiled_charsmap binary blob
                uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
                *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
                assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
                size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
                uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
                for (size_t i = 0; i < xcda_array_size; ++i) {
                    xcda_array[i] = __builtin_bswap32(xcda_array[i]);
                }
#endif
            }
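
            // Layout note (editor's, inferred from the byte-swapping code above):
            // the precompiled_charsmap blob appears to begin with a little-endian
            // uint32_t giving the byte size of the XCDA trie array, followed by
            // that array of uint32_t entries; on big-endian hosts every word is
            // therefore byte-swapped in place before use.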
        } else if (tokenizer_model == "rwkv") {
            vocab.type = LLAMA_VOCAB_TYPE_RWKV;

            // default special tokens
            vocab.special_bos_id = LLAMA_TOKEN_NULL;
            vocab.special_eos_id = LLAMA_TOKEN_NULL;
            vocab.special_unk_id = LLAMA_TOKEN_NULL;
            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = LLAMA_TOKEN_NULL;
        } else {
            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
        }

        // for now, only BPE models have pre-tokenizers
        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
            vocab.tokenizer_add_space_prefix = false;
            vocab.tokenizer_clean_spaces = true;
            if (tokenizer_pre == "default") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            } else if (
                    tokenizer_pre == "llama3"    ||
                    tokenizer_pre == "llama-v3"  ||
                    tokenizer_pre == "llama-bpe" ||
                    tokenizer_pre == "falcon3") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                vocab.tokenizer_ignore_merges = true;
                vocab.tokenizer_add_bos = true;
            } else if (
                    tokenizer_pre == "deepseek-llm") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "deepseek-coder") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "deepseek-v3") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "falcon") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
            } else if (
                    tokenizer_pre == "mpt") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
            } else if (
                    tokenizer_pre == "starcoder") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
            } else if (
                    tokenizer_pre == "gpt-2"        ||
                    tokenizer_pre == "phi-2"        ||
                    tokenizer_pre == "jina-es"      ||
                    tokenizer_pre == "jina-de"      ||
                    tokenizer_pre == "gigachat"     ||
                    tokenizer_pre == "jina-v1-en"   ||
                    tokenizer_pre == "jina-v2-es"   ||
                    tokenizer_pre == "jina-v2-de"   ||
                    tokenizer_pre == "jina-v2-code" ||
                    tokenizer_pre == "roberta-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "refact") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
            } else if (
                    tokenizer_pre == "command-r") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "qwen2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "stablelm2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
            } else if (
                    tokenizer_pre == "olmo") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
            } else if (
                    tokenizer_pre == "dbrx") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
            } else if (
                    tokenizer_pre == "smaug-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
            } else if (
                    tokenizer_pre == "poro-chat") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "chatglm-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
                vocab.special_bos_id = LLAMA_TOKEN_NULL;
            } else if (
                    tokenizer_pre == "viking") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "jais") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
            } else if (
                    tokenizer_pre == "tekken") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
                vocab.tokenizer_clean_spaces = false;
                vocab.tokenizer_ignore_merges = true;
                vocab.tokenizer_add_bos = true;
            } else if (
                    tokenizer_pre == "smollm") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "codeshell") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
            } else if (
                    tokenizer_pre == "bloom") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
            } else if (
                    tokenizer_pre == "gpt3-finnish") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
            } else if (
                    tokenizer_pre == "exaone") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
            } else if (
                    tokenizer_pre == "chameleon") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
                vocab.tokenizer_add_bos = true;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "minerva-7b") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
            } else if (
                    tokenizer_pre == "megrez") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            } else {
                LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            }
        } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_space_prefix = true;
            vocab.tokenizer_clean_spaces = false;
            vocab.tokenizer_add_bos = true;
            vocab.tokenizer_add_eos = false;
        } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_space_prefix = false;
            vocab.tokenizer_clean_spaces = true;
            vocab.tokenizer_add_bos = true;
            vocab.tokenizer_add_eos = false;
        } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_bos = false;
            vocab.tokenizer_add_eos = true;
        } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_space_prefix = false;
            vocab.tokenizer_clean_spaces = false;
            vocab.tokenizer_add_bos = false;
            vocab.tokenizer_add_eos = false;
        } else {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }

        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      vocab.tokenizer_add_space_prefix,         false);
        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
    }
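
    // Extension note (editor's sketch): registering a new BPE pre-tokenizer
    // follows the pattern above - add an enum value, match the name written by
    // the convert script, and set the tokenizer flags it needs. A hypothetical
    // entry (the name "my-model" and its enum value are illustrative only):
    //
    //     } else if (
    //             tokenizer_pre == "my-model") {
    //         vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MY_MODEL;
    //         vocab.tokenizer_clean_spaces = false;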

    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
    if (token_idx == -1) {
        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
    }

    const float * scores = nullptr;
    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
    if (score_idx != -1) {
        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
    }

    const int * toktypes = nullptr;
    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
    if (toktype_idx != -1) {
        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
    }

    const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
    vocab.n_vocab = n_vocab;
    vocab.id_to_token.resize(n_vocab);

    for (uint32_t i = 0; i < n_vocab; i++) {
        std::string word = gguf_get_arr_str(ctx, token_idx, i);
        if (word.empty()) {
            LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
            word = "[EMPTY_" + std::to_string(i) + "]";
        }

        vocab.token_to_id[word] = i;
        vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());

        auto & token_data = vocab.id_to_token[i];
        token_data.text  = std::move(word);
        token_data.score = scores ? scores[i] : 0.0f;
        token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;

        if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
            switch(toktypes[i]) {
                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN;      break;
                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attr = LLAMA_TOKEN_ATTR_UNUSED;       break;
                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;       break;
                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;      break;
                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attr = LLAMA_TOKEN_ATTR_BYTE;         break;
                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
                default:                            token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
            }
        }
    }
    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());

    vocab.init_tokenizer();

    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
        try {
            vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
        } catch (const std::exception & e) {
            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, e.what());
            vocab.linefeed_id = vocab.special_pad_id;
        }
    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
        vocab.linefeed_id = vocab.special_pad_id;
    } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
        const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
        vocab.linefeed_id = ids[0];
    } else {
        const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A

        //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
        if (ids.empty()) {
            LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
            vocab.linefeed_id = vocab.special_pad_id;
        } else {
            vocab.linefeed_id = ids[0];
        }
    }
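
    // Background note (editor's): "\xC4\x8A" is the UTF-8 encoding of U+010A
    // ('Ċ'), which is what the GPT-2 byte-to-unicode mapping turns the raw byte
    // 0x0A ('\n') into, so tokenizing it recovers the BPE newline token.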

    // special tokens
    {
        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
            { LLM_KV_TOKENIZER_BOS_ID,     vocab.special_bos_id     },
            { LLM_KV_TOKENIZER_EOS_ID,     vocab.special_eos_id     },
            { LLM_KV_TOKENIZER_EOT_ID,     vocab.special_eot_id     },
            { LLM_KV_TOKENIZER_EOM_ID,     vocab.special_eom_id     },
            { LLM_KV_TOKENIZER_UNK_ID,     vocab.special_unk_id     },
            { LLM_KV_TOKENIZER_SEP_ID,     vocab.special_sep_id     },
            { LLM_KV_TOKENIZER_PAD_ID,     vocab.special_pad_id     },
            { LLM_KV_TOKENIZER_CLS_ID,     vocab.special_cls_id     },
            { LLM_KV_TOKENIZER_MASK_ID,    vocab.special_mask_id    },
            { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
            { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
            { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
            { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
            { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
            { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },

            // deprecated
            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
        };

        for (const auto & it : special_token_types) {
            const std::string & key = kv(std::get<0>(it));
            int32_t & id = std::get<1>(it);

            uint32_t new_id;
            if (!ml.get_key(std::get<0>(it), new_id, false)) {
                continue;
            }
            if (new_id >= vocab.id_to_token.size()) {
                LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
                    __func__, key.c_str(), new_id, id);
            } else {
                id = new_id;
            }
        }
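
        // Example (editor's note): these overrides come from GGUF metadata keys
        // such as "tokenizer.ggml.bos_token_id"; a converter that writes
        //
        //     tokenizer.ggml.eos_token_id = 128009
        //
        // redirects vocab.special_eos_id away from the tokenizer-model default
        // set earlier, provided the id is within the vocabulary.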

        // Handle add_bos_token and add_eos_token
        {
            bool temp = true;

            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
                vocab.tokenizer_add_bos = temp;
            }
            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
                vocab.tokenizer_add_eos = temp;
            }
        }

        // auto-detect special tokens by text
        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
        //       for now, we apply this workaround to find the tokens based on their text
        for (const auto & t : vocab.token_to_id) {
            // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
            if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|eot_id|>"
                        || t.first == "<|im_end|>"
                        || t.first == "<|end|>"
                        || t.first == "<end_of_turn>"
                        || t.first == "<|endoftext|>"
                        || t.first == "<EOT>"
                        || t.first == "<|end▁of▁sentence|>" // DeepSeek
                   ) {
                    vocab.special_eot_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find EOM token: "<|eom_id|>"
            if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|eom_id|>"
                   ) {
                    vocab.special_eom_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
            if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|fim_prefix|>" // Qwen
                        || t.first == "<fim-prefix>"
                        || t.first == "<|fim▁begin|>" // DeepSeek
                        || t.first == "<PRE>"
                   ) {
                    vocab.special_fim_pre_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
            if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|fim_suffix|>" // Qwen
                        || t.first == "<fim-suffix>"
                        || t.first == "<|fim▁hole|>" // DeepSeek
                        || t.first == "<SUF>"
                   ) {
                    vocab.special_fim_suf_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
            if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|fim_middle|>" // Qwen
                        || t.first == "<fim-middle>"
                        || t.first == "<|fim▁end|>" // DeepSeek
                        || t.first == "<MID>"
                   ) {
                    vocab.special_fim_mid_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
            if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|fim_pad|>" // Qwen
                        || t.first == "<fim-pad>"
                        || t.first == "<PAD>"
                   ) {
                    vocab.special_fim_pad_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
            if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|fim_repo|>" // Qwen
                        || t.first == "<|repo_name|>"
                        || t.first == "<fim-repo>"
                        || t.first == "<REPO>"
                   ) {
                    vocab.special_fim_rep_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_SEP token: "<|file_sep|>"
            if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|file_sep|>" // Qwen
                   ) {
                    vocab.special_fim_sep_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
        }
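
        // Maintenance note (editor's sketch): detecting another chat format's
        // end-of-turn marker means appending its literal text to the matching
        // chain above, e.g. a hypothetical `|| t.first == "<|turn_end|>"` in
        // the EOT block; the control-attribute override then applies to it
        // automatically.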

        // maintain a list of tokens that cause end-of-generation
        // this is currently determined based on the token text, which is obviously not ideal
        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
        vocab.special_eog_ids.clear();

        if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
        }

        if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
        }

        if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
        }

        for (const auto & t : vocab.token_to_id) {
            if (false
                    || t.first == "<|eot_id|>"
                    || t.first == "<|im_end|>"
                    || t.first == "<|end|>"
                    || t.first == "<end_of_turn>"
                    || t.first == "<|endoftext|>"
                    || t.first == "<|eom_id|>"
                    || t.first == "<EOT>"
               ) {
                vocab.special_eog_ids.insert(t.second);
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                            __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            } else {
                // token is control, but not marked as EOG -> print a debug log
                if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
                            __func__, t.second, t.first.c_str());
                }
            }
        }

        // sanity checks
        if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_eos_id);
            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
        }

        if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_eot_id);
            LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
        }

        if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_eom_id);
            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
        }
    }
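
    // Usage note (editor's, hedged): downstream code checks membership in
    // special_eog_ids through the public llama_token_is_eog() helper, so a
    // typical generation loop stops on any of the ids collected above:
    //
    //     if (llama_token_is_eog(model, new_token_id)) {
    //         break; // end of generation
    //     }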

    // build special tokens cache
    {
        for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
            if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
                vocab.cache_special_tokens.push_back(id);
            }
        }

        std::sort(vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
            [&] (const llama_vocab::id a, const llama_vocab::id b) {
                return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
            }
        );

        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
    }
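
    // Design note (editor's, hedged): the cache is sorted by descending text
    // length so that special-token matching can try longer pieces first,
    // letting e.g. "<|endoftext|>" win over any shorter token that shares its
    // prefix.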

    // build token to piece cache
    {
        size_t size_cache = 0;

        std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);

        for (uint32_t id = 0; id < n_vocab; ++id) {
            cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);

            size_cache += cache_token_to_piece[id].size();
        }

        std::swap(vocab.cache_token_to_piece, cache_token_to_piece);

        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
    }

    // Handle per token attributes
    //NOTE: Each model customizes per token attributes.
    //NOTE: Per token attributes are missing from the GGUF file.
    //TODO: Extract attributes from GGUF file.
    {
        auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
            for (const auto & substr : substrs) {
                if (str.find(substr) != std::string::npos) {
                    return true;
                }
            }
            return false;
        };

        auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
            uint32_t current = vocab.id_to_token.at(id).attr;
            current = value ? (current | attr) : (current & ~attr);
            vocab.id_to_token[id].attr = (llama_token_attr) current;
        };

        auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
            _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
        };

        std::string model_name;
        std::string tokenizer_pre;

        ml.get_key(LLM_KV_GENERAL_NAME,  model_name,    false);
        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

        // model name to lowercase
        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
            [] (const std::string::value_type x) {
                return std::tolower(x);
            }
        );

        // set attributes by model/tokenizer name
        if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
            _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
            for (auto id : vocab.cache_special_tokens) {
                _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
            }
            for (auto token : {"</s>"}) {
                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
            }
            for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
            }
        }
    }
}
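
// Attribute semantics (editor's note, hedged): LLAMA_TOKEN_ATTR_LSTRIP/RSTRIP
// mark tokens whose adjacent whitespace is absorbed during tokenization, e.g.
// with RSTRIP set on "</s>" the input "</s>   world" tokenizes as if it were
// "</s>world"; the per-model table above exists only because GGUF files do not
// yet carry these attributes.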

void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    const auto & hparams = model.hparams;
    const auto & vocab   = model.vocab;

    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);

    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
        bool is_var = false;

        std::vector<uint32_t> v;
        for (uint32_t i = 0; i < n; ++i) {
            v.push_back(f(i));
            if (v[i] != v[0]) {
                is_var = true;
            }
        }

        std::stringstream ss;

        if (is_var) {
            ss << "[";
            for (uint32_t i = 0; i < n; ++i) {
                ss << v[i];
                if (i < n - 1) {
                    ss << ", ";
                }
            }
            ss << "]";
        } else {
            ss << v[0];
        }

        return ss.str();
    };
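
    // Output note (editor's): print_f collapses a per-layer hparam to a single
    // scalar when all layers agree and expands it otherwise, e.g. a uniform
    // model logs "n_head = 32" while a variable-GQA model logs
    // "n_head = [32, 32, 16, 16, ...]".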

    // hparams
    LLAMA_LOG_INFO("%s: format           = %s\n",     __func__, llama_file_version_name(ml.fver));
    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, llm_arch_name(model.arch));
    LLAMA_LOG_INFO("%s: vocab type       = %s\n",     __func__, llama_model_vocab_type_name(vocab.type));
    LLAMA_LOG_INFO("%s: n_vocab          = %u\n",     __func__, hparams.n_vocab);
    LLAMA_LOG_INFO("%s: n_merges         = %u\n",     __func__, (uint32_t) vocab.bpe_ranks.size());
    LLAMA_LOG_INFO("%s: vocab_only       = %d\n",     __func__, hparams.vocab_only);

    if (!hparams.vocab_only) {
        LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n",     __func__, hparams.n_ctx_train);
        LLAMA_LOG_INFO("%s: n_embd           = %u\n",     __func__, hparams.n_embd);
        LLAMA_LOG_INFO("%s: n_layer          = %u\n",     __func__, hparams.n_layer);
        LLAMA_LOG_INFO("%s: n_head           = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_head_kv        = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_rot            = %u\n",     __func__, hparams.n_rot);
        LLAMA_LOG_INFO("%s: n_swa            = %u\n",     __func__, hparams.n_swa);
        LLAMA_LOG_INFO("%s: n_embd_head_k    = %u\n",     __func__, hparams.n_embd_head_k);
        LLAMA_LOG_INFO("%s: n_embd_head_v    = %u\n",     __func__, hparams.n_embd_head_v);
        LLAMA_LOG_INFO("%s: n_gqa            = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_k_gqa     = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_v_gqa     = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n",   __func__, hparams.f_norm_eps);
        LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
        LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
        LLAMA_LOG_INFO("%s: f_logit_scale    = %.1e\n",   __func__, hparams.f_logit_scale);
        LLAMA_LOG_INFO("%s: n_ff             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
        LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
        LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
        LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
        LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
        LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
        LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
        LLAMA_LOG_INFO("%s: ssm_d_conv       = %u\n",     __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n",     __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state      = %u\n",     __func__, hparams.ssm_d_state);
        LLAMA_LOG_INFO("%s: ssm_dt_rank      = %u\n",     __func__, hparams.ssm_dt_rank);
        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms   = %d\n",     __func__, hparams.ssm_dt_b_c_rms);
    }

    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model).c_str());
    LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model).c_str());
    if (ml.n_elements >= 1e12) {
        LLAMA_LOG_INFO("%s: model params     = %.2f T\n", __func__, ml.n_elements*1e-12);
    } else if (ml.n_elements >= 1e9) {
        LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
    } else if (ml.n_elements >= 1e6) {
        LLAMA_LOG_INFO("%s: model params     = %.2f M\n", __func__, ml.n_elements*1e-6);
    } else {
        LLAMA_LOG_INFO("%s: model params     = %.2f K\n", __func__, ml.n_elements*1e-3);
    }
    if (ml.n_bytes < GiB) {
        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,        ml.n_bytes*8.0/ml.n_elements);
    } else {
        LLAMA_LOG_INFO("%s: model size       = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
    }

    // general kv
    LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());

    // special tokens
    if (vocab.special_bos_id  != -1)    { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id,     vocab.id_to_token[vocab.special_bos_id].text.c_str() );  }
    if (vocab.special_eos_id  != -1)    { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id,     vocab.id_to_token[vocab.special_eos_id].text.c_str() );  }
    if (vocab.special_eot_id  != -1)    { LLAMA_LOG_INFO( "%s: EOT token        = %d '%s'\n", __func__, vocab.special_eot_id,     vocab.id_to_token[vocab.special_eot_id].text.c_str() );  }
    if (vocab.special_eom_id  != -1)    { LLAMA_LOG_INFO( "%s: EOM token        = %d '%s'\n", __func__, vocab.special_eom_id,     vocab.id_to_token[vocab.special_eom_id].text.c_str() );  }
    if (vocab.special_unk_id  != -1)    { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id,     vocab.id_to_token[vocab.special_unk_id].text.c_str() );  }
    if (vocab.special_sep_id  != -1)    { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id,     vocab.id_to_token[vocab.special_sep_id].text.c_str() );  }
    if (vocab.special_pad_id  != -1)    { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id,     vocab.id_to_token[vocab.special_pad_id].text.c_str() );  }
    if (vocab.special_cls_id  != -1)    { LLAMA_LOG_INFO( "%s: CLS token        = %d '%s'\n", __func__, vocab.special_cls_id,     vocab.id_to_token[vocab.special_cls_id].text.c_str() );  }
    if (vocab.special_mask_id != -1)    { LLAMA_LOG_INFO( "%s: MASK token       = %d '%s'\n", __func__, vocab.special_mask_id,    vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }

    if (vocab.linefeed_id != -1)        { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,        vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }

    if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token    = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
    if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token    = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
    if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token    = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
    if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token    = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
    if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token    = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
    if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token    = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }

    for (const auto & id : vocab.special_eog_ids) {
        LLAMA_LOG_INFO( "%s: EOG token        = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
    }

    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);

    if (model.arch == LLM_ARCH_DEEPSEEK) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",   __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",   __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",   __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
    }

    if (model.arch == LLM_ARCH_DEEPSEEK2) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",   __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_lora_q             = %d\n",   __func__, hparams.n_lora_q);
        LLAMA_LOG_INFO("%s: n_lora_kv            = %d\n",   __func__, hparams.n_lora_kv);
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",   __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",   __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",   __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",   __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n", __func__, hparams.rope_yarn_log_mul);
    }

    if (model.arch == LLM_ARCH_QWEN2MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n",     __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n",     __func__, hparams.n_ff_shexp);
    }

    if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
        LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
    }
}

//
// interface implementation
//

struct llama_model_params llama_model_default_params() {
    struct llama_model_params result = {
        /*.devices                     =*/ nullptr,
        /*.n_gpu_layers                =*/ 0,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.rpc_servers                 =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
    };

#ifdef GGML_USE_METAL
    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
    result.n_gpu_layers = 999;
#endif

    return result;
}
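
// Usage sketch (editor's, hedged): callers copy the defaults and override only
// what they need before loading, e.g.:
//
//     llama_model_params mparams = llama_model_default_params();
//     mparams.n_gpu_layers = 32;    // offload 32 layers
//     mparams.use_mlock    = true;  // pin weights in RAM
//     llama_model * model = llama_load_model_from_file("model.gguf", mparams);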

void llama_free_model(struct llama_model * model) {
    delete model;
}

enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
    return model->vocab.type;
}

int32_t llama_n_vocab(const struct llama_model * model) {
    return model->hparams.n_vocab;
}

int32_t llama_n_ctx_train(const struct llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_n_layer(const struct llama_model * model) {
    return model->hparams.n_layer;
}

int32_t llama_n_head(const struct llama_model * model) {
    return model->hparams.n_head();
}

enum llama_rope_type llama_rope_type(const struct llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_WAVTOKENIZER_DEC:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_MLLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_ORION:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_CHAMELEON:
        case LLM_ARCH_SOLAR:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_MINICPM3:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}
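
// Rotation-layout note (editor's, hedged): NORM-style RoPE rotates consecutive
// head dimensions as pairs (x[2*i], x[2*i+1]), while NEOX-style rotates pairs
// split across the two halves of the rotary block, (x[i], x[i + n_rot/2]); the
// mapping above records which convention each architecture was trained with.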

float llama_rope_freq_scale_train(const struct llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}

int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_meta_count(const struct llama_model * model) {
    return (int)model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}
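
// Usage sketch (editor's): the three meta accessors combine into a simple dump
// of all GGUF KV pairs; the buffer sizes here are illustrative:
//
//     char key[256], val[2048];
//     const int32_t n_kv = llama_model_meta_count(model);
//     for (int32_t i = 0; i < n_kv; ++i) {
//         llama_model_meta_key_by_index    (model, i, key, sizeof(key));
//         llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
//         printf("%s = %s\n", key, val);
//     }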

int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s %s %s",
            llama_model_arch_name (*model).c_str(),
            llama_model_type_name (*model).c_str(),
            llama_model_ftype_name(*model).c_str());
}

uint64_t llama_model_size(const struct llama_model * model) {
    return model->n_bytes;
}

uint64_t llama_model_n_params(const struct llama_model * model) {
    return model->n_elements;
}

bool llama_model_has_encoder(const struct llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5:        return true;
        case LLM_ARCH_T5ENCODER: return true;
        default:                 return false;
    }
}

bool llama_model_has_decoder(const struct llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default:                 return true;
    }
}

llama_token llama_model_decoder_start_token(const struct llama_model * model) {
    return model->hparams.dec_start_token_id;
}
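
// Flow note (editor's, hedged): for encoder-decoder models such as T5, callers
// first run llama_encode() on the prompt, then seed decoding with the token
// returned here (falling back to BOS when it is LLAMA_TOKEN_NULL) before the
// usual llama_decode() loop.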

bool llama_model_is_recurrent(const struct llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_MAMBA: return true;
        case LLM_ARCH_RWKV6: return true;
        default:             return false;
    }
}