// llama-model.cpp

#include "llama-model.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model-loader.h"

#include "ggml-cpp.h"

#include <algorithm>
#include <cassert>
#include <cstring>
#include <functional>
#include <map>
#include <sstream>
#include <stdexcept>

const char * llm_type_name(llm_type type) {
    switch (type) {
        case LLM_TYPE_14M: return "14M";
        case LLM_TYPE_17M: return "17M";
        case LLM_TYPE_22M: return "22M";
        case LLM_TYPE_33M: return "33M";
        case LLM_TYPE_60M: return "60M";
        case LLM_TYPE_70M: return "70M";
        case LLM_TYPE_80M: return "80M";
        case LLM_TYPE_109M: return "109M";
        case LLM_TYPE_137M: return "137M";
        case LLM_TYPE_160M: return "160M";
        case LLM_TYPE_220M: return "220M";
        case LLM_TYPE_250M: return "250M";
        case LLM_TYPE_270M: return "270M";
        case LLM_TYPE_335M: return "335M";
        case LLM_TYPE_410M: return "410M";
        case LLM_TYPE_450M: return "450M";
        case LLM_TYPE_770M: return "770M";
        case LLM_TYPE_780M: return "780M";
        case LLM_TYPE_0_5B: return "0.5B";
        case LLM_TYPE_1B: return "1B";
        case LLM_TYPE_1_3B: return "1.3B";
        case LLM_TYPE_1_4B: return "1.4B";
        case LLM_TYPE_1_5B: return "1.5B";
        case LLM_TYPE_1_6B: return "1.6B";
        case LLM_TYPE_2B: return "2B";
        case LLM_TYPE_2_8B: return "2.8B";
        case LLM_TYPE_3B: return "3B";
        case LLM_TYPE_4B: return "4B";
        case LLM_TYPE_6B: return "6B";
        case LLM_TYPE_6_9B: return "6.9B";
        case LLM_TYPE_7B: return "7B";
        case LLM_TYPE_8B: return "8B";
        case LLM_TYPE_9B: return "9B";
        case LLM_TYPE_11B: return "11B";
        case LLM_TYPE_12B: return "12B";
        case LLM_TYPE_13B: return "13B";
        case LLM_TYPE_14B: return "14B";
        case LLM_TYPE_15B: return "15B";
        case LLM_TYPE_16B: return "16B";
        case LLM_TYPE_20B: return "20B";
        case LLM_TYPE_30B: return "30B";
        case LLM_TYPE_32B: return "32B";
        case LLM_TYPE_34B: return "34B";
        case LLM_TYPE_35B: return "35B";
        case LLM_TYPE_40B: return "40B";
        case LLM_TYPE_65B: return "65B";
        case LLM_TYPE_70B: return "70B";
        case LLM_TYPE_236B: return "236B";
        case LLM_TYPE_314B: return "314B";
        case LLM_TYPE_671B: return "671B";
        case LLM_TYPE_SMALL: return "0.1B";
        case LLM_TYPE_MEDIUM: return "0.4B";
        case LLM_TYPE_LARGE: return "0.8B";
        case LLM_TYPE_XL: return "1.5B";
        case LLM_TYPE_A1_7B: return "A1.7B";
        case LLM_TYPE_A2_7B: return "A2.7B";
        case LLM_TYPE_8x7B: return "8x7B";
        case LLM_TYPE_8x22B: return "8x22B";
        case LLM_TYPE_16x12B: return "16x12B";
        case LLM_TYPE_16x3_8B: return "16x3.8B";
        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
        case LLM_TYPE_57B_A14B: return "57B.A14B";
        case LLM_TYPE_27B: return "27B";
        default: return "?B";
    }
}

static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default: return "unknown";
    }
}

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};

static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }
    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}

// checks if the weight tensor can be used with the specified buffer type and device
static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    GGML_ASSERT(w != nullptr);

    if (op == GGML_OP_NONE) {
        return true;
    }

    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    ggml_context_ptr ctx_ptr { ggml_init(params) };
    if (!ctx_ptr) {
        throw std::runtime_error(format("failed to create ggml context"));
    }
    ggml_context * ctx = ctx_ptr.get();

    ggml_tensor * op_tensor = nullptr;

    switch (op) {
        case GGML_OP_GET_ROWS:
            {
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
                op_tensor = ggml_get_rows(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT:
            {
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
                op_tensor = ggml_mul_mat(ctx, w, b);
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
                int n_expert_used = hparams.n_expert_used;
                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
            } break;
        case GGML_OP_ADD:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_add(ctx, a, w);
            } break;
        case GGML_OP_MUL:
            {
                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                op_tensor = ggml_mul(ctx, a, w);
            } break;
        case GGML_OP_DIV:
            {
                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
                op_tensor = ggml_div(ctx, a, w);
            } break;
        case GGML_OP_ROPE:
            {
                int n_embd_head = hparams.n_embd_head_v;
                int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
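                // note: dummy rope parameters below - this call only probes whether the
                // backend supports the op for this weight; the numerical result is never used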
                op_tensor = ggml_rope_ext(
                    ctx, a, b, w,
                    0, 0, 0, 0, 0,
                    0, 0, 0, 0
                );
            } break;
        case GGML_OP_SSM_CONV:
            {
                // FIXME
                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
            } break;
        case GGML_OP_SSM_SCAN:
            {
                // FIXME
                const int64_t d_state = w->ne[0];
                const int64_t d_inner = w->ne[1];
                const int64_t n_seq_tokens = 512;
                const int64_t n_seqs = 1;
                ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
                ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
                ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
                ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
            } break;
        case GGML_OP_RWKV_WKV6:
            {
                // FIXME
                const int64_t S = 123;
                const int64_t H = 123;
                const int64_t n_tokens = 123;
                const int64_t n_seqs = 123;
                ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * tf = w;
                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
            } break;
        case GGML_OP_IM2COL:
            {
                const int n_embd = hparams.n_embd;
                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
            } break;
        default:
            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
    }

    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
    GGML_ASSERT(w->buffer == nullptr);
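    // a zero-size allocation is sufficient here: supports_op only needs to see the buffer type, not real data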
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    return op_supported;
}

// lists of buffer types used for each layer
using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;

// find the first buffer type in the list that can use the tensor
static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
    GGML_ASSERT(!buft_list.empty());
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
            return cur_buft;
        }
    }
    return nullptr;
}
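
// example usage (sketch only, not part of the original flow): pick the first buffer type that
// can run a matmul with a given weight tensor `w`, using a previously built `buft_list`
//
//   ggml_backend_buffer_type_t buft = select_weight_buft(hparams, w, GGML_OP_MUL_MAT, buft_list);
//   if (buft == nullptr) {
//       throw std::runtime_error("no suitable buffer type found");
//   }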

// CPU: ACCEL -> CPU extra -> GPU host -> CPU
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
    buft_list_t buft_list;

    // add ACCEL buffer types
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
            auto * buft = ggml_backend_dev_buffer_type(dev);
            // skip the CPU buffer type - it is added at the end as the final fallback
            if (buft != ggml_backend_cpu_buffer_type()) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add extra buffer types
    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
    if (ggml_backend_dev_get_extra_bufts_fn) {
        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
        while (extra_bufts && *extra_bufts) {
            buft_list.emplace_back(cpu_dev, *extra_bufts);
            ++extra_bufts;
        }
    }

    // add a host buffer type
    // storing the tensors in a host buffer is useful when the processing of large batches
    // is offloaded to a GPU device, since it reduces the time spent on data transfers
    // generally, this will be done using the first device in the list
    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
    // function of the device to determine if it would benefit from being stored in a host buffer
    for (auto * dev : devices) {
        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
        if (buft) {
            buft_list.emplace_back(dev, buft);
            break;
        }
    }

    // add the CPU buffer type
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
        }
    }

    return buft_list;
}

// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
    buft_list_t buft_list;

    // add the device split buffer type if requested and available
    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
        if (ggml_backend_split_buffer_type_fn) {
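            // the split buffer type expects the index of the device within its backend
            // registry, so look it up by scanning the registry entries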
            size_t dev_index = [&]() {
                auto * reg = ggml_backend_dev_backend_reg(dev);
                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
                        return i;
                    }
                }
                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
            }();
            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
            if (buft != nullptr) {
                buft_list.emplace_back(dev, buft);
            }
        }
    }

    // add the device default buffer type
    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));

    return buft_list;
}

struct llama_model::impl {
    impl() {}
    ~impl() {}

    uint64_t n_elements = 0;

    size_t n_bytes = 0;

    std::string desc_str;

    // model memory mapped files
    llama_mmaps mappings;

    // objects representing data potentially being locked in memory
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    // contexts where the model tensors metadata is stored
    std::vector<ggml_context_ptr> ctxs;

    // the model memory buffers for the tensor data
    std::vector<ggml_backend_buffer_ptr> bufs;

    buft_list_t cpu_buft_list;
    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;

    struct layer_dev {
        ggml_backend_dev_t dev;
        buft_list_t * buft_list;
    };

    layer_dev dev_input = {};
    layer_dev dev_output = {};
    std::vector<layer_dev> dev_layer;
};

llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
}

llama_model::~llama_model() {}

void llama_model::load_stats(llama_model_loader & ml) {
    pimpl->n_elements = ml.n_elements;
    pimpl->n_bytes = ml.n_bytes;
}

void llama_model::load_arch(llama_model_loader & ml) {
    arch = ml.get_arch();
    if (arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}

void llama_model::load_hparams(llama_model_loader & ml) {
    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        enum gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
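    // the vocab size is either stored explicitly, or derived from the length of the tokenizer token list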
    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);

    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false);

    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
    }

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
    }

    // zero-out the array hparams
    std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
    std::fill(hparams.cross_attn_layers.begin(), hparams.cross_attn_layers.end(), -1);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
    ml.get_arr(LLM_KV_ATTENTION_CROSS_ATTENTION_LAYERS, hparams.cross_attn_layers, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
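    // a factor of 0.0f means the key was absent - treat it as no scaling (1.0f)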
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

    // non-transformer models do not have attention heads
    if (hparams.n_head() > 0) {
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
        // gpt-j n_rot = rotary_dim
        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);

        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);

        // sanity check for n_rot (optional)
        hparams.n_rot = hparams.n_embd_head_k;
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_MLLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
        }
    } else {
        hparams.n_rot = 0;
        hparams.n_embd_head_k = 0;
        hparams.n_embd_head_v = 0;
    }

    // for differentiating model types
    uint32_t n_vocab = 0;
    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

    // arch-specific KVs
    switch (arch) {
        case LLM_ARCH_LLAMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                if (hparams.n_expert == 8) {
                    switch (hparams.n_layer) {
                        case 32: type = LLM_TYPE_8x7B; break;
                        case 56: type = LLM_TYPE_8x22B; break;
                        default: type = LLM_TYPE_UNKNOWN;
                    }
                } else {
                    switch (hparams.n_layer) {
                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
                        case 22: type = LLM_TYPE_1B; break;
                        case 26: type = LLM_TYPE_3B; break;
                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
                        // granite uses a vocab with len 49152
                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                        case 36: type = LLM_TYPE_8B; break; // granite
                        case 40: type = LLM_TYPE_13B; break;
                        case 48: type = LLM_TYPE_34B; break;
                        case 60: type = LLM_TYPE_30B; break;
                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
                        default: type = LLM_TYPE_UNKNOWN;
                    }
                }
            } break;
        case LLM_ARCH_MLLAMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 40: type = LLM_TYPE_11B; break;
                    case 100: type = LLM_TYPE_90B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 80: type = LLM_TYPE_70B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MINICPM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                switch (hparams.n_layer) {
                    case 52: type = LLM_TYPE_1B; break;
                    case 40: type = LLM_TYPE_2B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MINICPM3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
                switch (hparams.n_layer) {
                    case 62: type = LLM_TYPE_4B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GROK:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 64: type = LLM_TYPE_314B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_FALCON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 60: type = LLM_TYPE_40B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 40: type = LLM_TYPE_13B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
                if (type == LLM_TYPE_13B) {
                    // TODO: become GGUF KV parameter
                    hparams.f_max_alibi_bias = 8.0f;
                }
            } break;
        case LLM_ARCH_STARCODER:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_1B; break;
                    case 36: type = LLM_TYPE_3B; break;
                    case 42: type = LLM_TYPE_7B; break;
                    case 40: type = LLM_TYPE_15B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_REFACT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_1B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
                // TODO: become GGUF KV parameter
                hparams.f_max_alibi_bias = 8.0f;
            } break;
        case LLM_ARCH_BERT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                switch (hparams.n_layer) {
                    case 3:
                        type = LLM_TYPE_17M; break; // bge-micro
                    case 6:
                        type = LLM_TYPE_22M; break; // MiniLM-L6
                    case 12:
                        switch (hparams.n_embd) {
                            case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
                            case 768: type = LLM_TYPE_109M; break; // bge-base
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 24:
                        type = LLM_TYPE_335M; break; // bge-large
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_JINA_BERT_V2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                hparams.f_max_alibi_bias = 8.0f;
                switch (hparams.n_layer) {
                    case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
                    case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_NOMIC_BERT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                    type = LLM_TYPE_137M;
                }
            } break;
        case LLM_ARCH_BLOOM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_1B; break;
                    case 30:
                        switch (hparams.n_embd) {
                            case 2560: type = LLM_TYPE_3B; break;
                            case 4096: type = LLM_TYPE_7B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
                // TODO: become GGUF KV parameter
                hparams.f_max_alibi_bias = 8.0f;
            } break;
        case LLM_ARCH_MPT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 48: type = LLM_TYPE_30B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_STABLELM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_1B; break;
                    case 32: type = LLM_TYPE_3B; break;
                    case 40: type = LLM_TYPE_12B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_QWEN:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 40: type = LLM_TYPE_13B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_QWEN2VL:
            {
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
            }
            // fall through
        case LLM_ARCH_QWEN2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
                    case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
                    case 32: type = LLM_TYPE_7B; break;
                    case 36: type = LLM_TYPE_3B; break;
                    case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
                    case 48: type = LLM_TYPE_14B; break;
                    case 64: type = LLM_TYPE_32B; break;
                    case 80: type = LLM_TYPE_70B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_QWEN2MOE:
            {
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_A2_7B; break;
                    case 28: type = LLM_TYPE_57B_A14B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PHI2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_1B; break;
                    case 32: type = LLM_TYPE_3B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PHI3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_1B; break;
                    case 32: type = LLM_TYPE_3B; break;
                    case 40: type = LLM_TYPE_14B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }

                // for backward compatibility; see: https://github.com/ggerganov/llama.cpp/pull/8931
                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
                    hparams.n_swa = 2047;
                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
                    // default value for Phi-3-mini-128k-instruct
                    hparams.n_swa = 262144;
                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
                    // default value for Phi-3-medium-128k-instruct
                    hparams.n_swa = 131072;
                }
                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                if (!found_swa && hparams.n_swa == 0) {
                    throw std::runtime_error("invalid value for sliding_window");
                }
            } break;
        case LLM_ARCH_PHIMOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_16x3_8B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PLAMO:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 40: type = LLM_TYPE_13B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GPT2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 12: type = LLM_TYPE_SMALL; break;
                    case 24: type = LLM_TYPE_MEDIUM; break;
                    case 36: type = LLM_TYPE_LARGE; break;
                    case 48: type = LLM_TYPE_XL; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CODESHELL:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 42: type = LLM_TYPE_7B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_ORION:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 40: type = LLM_TYPE_14B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_INTERNLM2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 48: type = LLM_TYPE_20B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 18: type = LLM_TYPE_2B; break;
                    case 28: type = LLM_TYPE_7B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA2:
            {
                hparams.n_swa = 4096; // default value of gemma 2
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
                hparams.attn_soft_cap = true;

                switch (hparams.n_layer) {
                    case 26: type = LLM_TYPE_2B; break;
                    case 42: type = LLM_TYPE_9B; break;
                    case 46: type = LLM_TYPE_27B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA3:
            {
            } break;
        case LLM_ARCH_STARCODER2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 30: type = LLM_TYPE_3B; break;
                    case 32: type = LLM_TYPE_7B; break;
                    case 40: type = LLM_TYPE_15B; break;
                    case 52: type = LLM_TYPE_20B; break; // granite
                    case 88: type = LLM_TYPE_34B; break; // granite
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MAMBA:
            {
                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
                ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 24:
                        switch (hparams.n_embd) {
                            case 768: type = LLM_TYPE_SMALL; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 48:
                        switch (hparams.n_embd) {
                            case 1024: type = LLM_TYPE_MEDIUM; break;
                            case 1536: type = LLM_TYPE_LARGE; break;
                            case 2048: type = LLM_TYPE_XL; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 64:
                        switch (hparams.n_embd) {
                            case 2560: type = LLM_TYPE_3B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_XVERSE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_7B; break;
                    case 40: type = LLM_TYPE_13B; break;
                    case 80: type = LLM_TYPE_65B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 40: type = LLM_TYPE_35B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_COHERE2:
            {
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_8B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DBRX:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
                switch (hparams.n_layer) {
                    case 40: type = LLM_TYPE_16x12B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMO:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
                switch (hparams.n_layer) {
                    case 22: type = LLM_TYPE_1B; break;
                    case 32: type = LLM_TYPE_7B; break;
                    case 80: type = LLM_TYPE_70B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMO2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 16: type = LLM_TYPE_1B; break;
                    case 32: type = LLM_TYPE_7B; break;
                    case 40: type = LLM_TYPE_13B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 16: type = LLM_TYPE_A1_7B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OPENELM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 16: type = LLM_TYPE_270M; break;
                    case 20: type = LLM_TYPE_450M; break;
                    case 28: type = LLM_TYPE_1B; break;
                    case 36: type = LLM_TYPE_3B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
                switch (hparams.n_layer) {
                    case 6:
                        switch (hparams.n_ff()) {
                            case 512: type = LLM_TYPE_14M; break;
                            case 2048: type = LLM_TYPE_70M; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 12:
                        switch (hparams.n_ff()) {
                            case 3072: type = LLM_TYPE_160M; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 16:
                        switch (hparams.n_ff()) {
                            case 8192: type = LLM_TYPE_1B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_ff()) {
                            case 4096: type = LLM_TYPE_410M; break;
                            case 8192: type = LLM_TYPE_1_4B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 32:
                        switch (hparams.n_ff()) {
                            case 10240: type = LLM_TYPE_2_8B; break;
                            case 16384: type = LLM_TYPE_6_9B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 36:
                        switch (hparams.n_ff()) {
                            case 20480: type = LLM_TYPE_12B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 44:
                        switch (hparams.n_ff()) {
                            case 24576: type = LLM_TYPE_20B; break;
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_ARCTIC:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                if (hparams.n_expert == 128) {
                    switch (hparams.n_layer) {
                        case 35: type = LLM_TYPE_10B_128x3_66B; break;
                        default: type = LLM_TYPE_UNKNOWN;
                    }
                } else {
                    type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
                switch (hparams.n_layer) {
                    case 28: type = LLM_TYPE_20B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
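                // a 27-layer model is assumed to be the ~16B "lite" variant (DeepSeek-V2-Lite),
                // which does not store a separate q LoRA rank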
                bool is_lite = (hparams.n_layer == 27);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                if (!is_lite) {
                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                }
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
                    // that have no expert_gating_func model parameter set
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                }
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);

                switch (hparams.n_layer) {
                    case 27: type = LLM_TYPE_16B; break;
                    case 60: type = LLM_TYPE_236B; break;
                    case 61: type = LLM_TYPE_671B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CHATGLM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 28: {
                        if (hparams.n_head(0) == 16) {
                            type = LLM_TYPE_1_5B;
                        } else {
                            type = LLM_TYPE_6B;
                        }
                    } break;
                    case 40: {
                        if (hparams.n_head(0) == 24) {
                            type = LLM_TYPE_4B;
                        } else {
                            type = LLM_TYPE_9B;
                        }
                    } break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BITNET:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 26: type = LLM_TYPE_3B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_T5:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);

                uint32_t dec_start_token_id;
                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
                    hparams.dec_start_token_id = dec_start_token_id;
                }

                switch (hparams.n_layer) {
                    case 6: type = LLM_TYPE_60M; break; // t5-small
                    case 8: type = LLM_TYPE_80M; break; // flan-t5-small
                    case 12:
                        switch (hparams.n_ff()) {
                            case 3072: type = LLM_TYPE_220M; break; // t5-base
                            case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_ff()) {
                            case 4096: type = LLM_TYPE_770M; break; // t5-large
                            case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
                            case 16384: type = LLM_TYPE_3B; break; // t5-3b
                            case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
                            case 65536: type = LLM_TYPE_11B; break; // t5-11b
                            case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
                            default: type = LLM_TYPE_UNKNOWN;
                        } break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
                type = LLM_TYPE_UNKNOWN;
            } break;
        case LLM_ARCH_JAIS:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
                switch (hparams.n_layer) {
                    case 24: type = LLM_TYPE_1_3B; break;
                    case 40: type = LLM_TYPE_13B; break;
                    /* TODO: add variants */
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_4B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_EXAONE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: type = LLM_TYPE_8B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
            } break;
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_RWKV6QWEN2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
                ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  1105. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  1106. ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
  1107. switch (hparams.n_layer) {
  1108. case 24: type = LLM_TYPE_1_6B; break;
  1109. case 32:
  1110. switch (hparams.n_embd) {
  1111. case 2560: type = LLM_TYPE_3B; break;
  1112. case 4096: type = LLM_TYPE_7B; break;
  1113. default: type = LLM_TYPE_UNKNOWN;
  1114. } break;
  1115. case 61: type = LLM_TYPE_14B; break;
  1116. case 64: type = LLM_TYPE_32B; break;
  1117. default: type = LLM_TYPE_UNKNOWN;
  1118. }
  1119. } break;
  1120. case LLM_ARCH_GRANITE:
  1121. case LLM_ARCH_GRANITE_MOE:
  1122. {
  1123. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1124. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  1125. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  1126. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  1127. ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
  1128. switch (hparams.n_layer) {
  1129. case 32: type = LLM_TYPE_3B; break;
  1130. case 40: type = LLM_TYPE_3B; break;
  1131. // Add additional layer/vocab/etc checks here for other model sizes
  1132. default: type = LLM_TYPE_UNKNOWN;
  1133. }
  1134. } break;
  1135. case LLM_ARCH_CHAMELEON:
  1136. {
  1137. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1138. hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
  1139. ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
  1140. switch (hparams.n_layer) {
  1141. case 32: type = LLM_TYPE_7B; break;
  1142. case 48: type = LLM_TYPE_34B; break;
  1143. default: type = LLM_TYPE_UNKNOWN;
  1144. }
  1145. } break;
  1146. case LLM_ARCH_SOLAR:
  1147. {
  1148. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  1149. for (size_t i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
  1150. auto & bskcn = hparams.n_bskcn_arr[i];
  1151. bskcn.fill(0);
  1152. auto kv = LLM_KV(arch);
  1153. ml.get_key_or_arr(format((kv(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION) + ".%d").c_str(), i), bskcn, hparams.n_layer, false);
  1154. }
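// each entry of n_bskcn_arr lists the layers taking part in one of SOLAR's block skip connections;
// the arrays are zero-filled first, so connections absent from the GGUF metadata simply stay empty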
  1155. switch (hparams.n_layer) {
  1156. case 64: type = LLM_TYPE_22B; break;
  1157. default: type = LLM_TYPE_UNKNOWN;
  1158. }
  1159. } break;
  1160. case LLM_ARCH_WAVTOKENIZER_DEC:
  1161. {
  1162. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  1163. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  1164. ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
  1165. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  1166. } break;
  1167. default: throw std::runtime_error("unsupported model architecture");
  1168. }
  1169. pimpl->n_bytes = ml.n_bytes;
  1170. pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
  1171. if (hparams.f_max_alibi_bias > 0.0f) {
  1172. hparams.use_alibi = true;
  1173. }
  1174. hparams.rope_type = llama_model_rope_type(this);
  1175. }
  1176. void llama_model::load_vocab(llama_model_loader & ml) {
  1177. const auto kv = LLM_KV(arch);
  1178. vocab.load(ml, kv);
  1179. }
  1180. bool llama_model::load_tensors(llama_model_loader & ml) {
  1181. const auto & split_mode = params.split_mode;
  1182. const auto & n_gpu_layers = params.n_gpu_layers;
  1183. const auto & use_mlock = params.use_mlock;
  1184. const auto & tensor_split = params.tensor_split;
  1185. const int n_layer = hparams.n_layer;
  1186. const bool use_mmap_buffer = true;
  1187. LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
  1188. // build a list of buffer types for the CPU and GPU devices
  1189. pimpl->cpu_buft_list = make_cpu_buft_list(devices);
  1190. for (auto * dev : devices) {
  1191. buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
  1192. // add CPU buffer types as a fallback
  1193. buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
  1194. pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
  1195. }
  1196. // calculate the split points
  1197. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
  1198. std::vector<float> splits(n_devices());
  1199. if (all_zero) {
  1200. // default split, by free memory
  1201. for (size_t i = 0; i < n_devices(); ++i) {
  1202. ggml_backend_dev_t dev = devices[i];
  1203. size_t total;
  1204. size_t free;
  1205. ggml_backend_dev_memory(dev, &free, &total);
  1206. splits[i] = free;
  1207. }
  1208. } else {
  1209. std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
  1210. }
  1211. // sum and normalize the splits to get the split points
  1212. float split_sum = 0.0f;
  1213. for (size_t i = 0; i < n_devices(); ++i) {
  1214. split_sum += splits[i];
  1215. splits[i] = split_sum;
  1216. }
  1217. for (size_t i = 0; i < n_devices(); ++i) {
  1218. splits[i] /= split_sum;
  1219. }
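// worked example (hypothetical numbers): with two GPUs reporting 8 GB and 24 GB free, splits becomes {8, 32}
// after the prefix sum and {0.25, 1.0} after normalization; a layer whose fractional position among the
// offloaded layers is below 0.25 lands on device 0, the rest on device 1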
  1220. ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1221. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
  1222. const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
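// assign each transformer layer to a device: everything below i_gpu_start stays on the CPU, and the offloaded
// layers are spread over the GPUs in proportion to the split points computed above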
  1223. auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
  1224. if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  1225. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev));
  1226. return {cpu_dev, &pimpl->cpu_buft_list};
  1227. }
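// find the first device whose cumulative split point exceeds this layer's fractional position among the offloaded layers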
  1228. const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
  1229. auto * dev = devices.at(layer_gpu);
  1230. LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev));
  1231. return {dev, &pimpl->gpu_buft_list.at(dev)};
  1232. };
  1233. // assign the input layer
  1234. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  1235. pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
  1236. // assign the repeating layers to the devices according to the splits
  1237. pimpl->dev_layer.resize(n_layer);
  1238. for (int il = 0; il < n_layer; ++il) {
  1239. pimpl->dev_layer[il] = get_layer_buft_list(il);
  1240. }
  1241. // assign the output layer
  1242. pimpl->dev_output = get_layer_buft_list(n_layer);
  1243. // one ggml context per buffer type
  1244. int max_n_tensors = ml.n_tensors;
  1245. max_n_tensors += 1; // duplicated output tensor
  1246. max_n_tensors += n_layer*2; // duplicated rope freq tensors
  1247. const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
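// only tensor metadata lives in these contexts (they are created with no_alloc below); the weight data itself
// is allocated later in backend buffers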
  1248. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
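// lazily create one ggml context per buffer type; all weight tensors that end up in the same buffer type share a context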
  1249. auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  1250. auto it = ctx_map.find(buft);
  1251. if (it == ctx_map.end()) {
  1252. ggml_init_params params = {
  1253. /*.mem_size =*/ ctx_size,
  1254. /*.mem_buffer =*/ NULL,
  1255. /*.no_alloc =*/ true,
  1256. };
  1257. ggml_context * ctx = ggml_init(params);
  1258. if (!ctx) {
  1259. throw std::runtime_error(format("failed to create ggml context"));
  1260. }
  1261. ctx_map[buft] = ctx;
  1262. pimpl->ctxs.emplace_back(ctx);
  1263. return ctx;
  1264. }
  1265. return it->second;
  1266. };
  1267. const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
  1268. const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
  1269. // create tensors for the weights
  1270. {
  1271. // note: cast to int64_t since we will use these for the tensor dimensions
  1272. const int64_t n_head = hparams.n_head();
  1273. const int64_t n_head_kv = hparams.n_head_kv();
  1274. const int64_t n_embd = hparams.n_embd;
  1275. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  1276. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  1277. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  1278. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  1279. const int64_t n_ff = hparams.n_ff();
  1280. const int64_t n_embd_gqa = n_embd_v_gqa;
  1281. const int64_t n_vocab = hparams.n_vocab;
  1282. const int64_t n_token_types = vocab.n_token_types();
  1283. const int64_t n_rot = hparams.n_rot;
  1284. const int64_t n_expert = hparams.n_expert;
  1285. const int64_t n_expert_used = hparams.n_expert_used;
  1286. const int64_t n_ctx_train = hparams.n_ctx_train;
  1287. if (n_expert > 0 && hparams.n_expert_used == 0) {
  1288. throw std::runtime_error("model has expert layers but no expert layers are used");
  1289. }
  1290. int n_moved_tensors = 0;
  1291. ggml_tensor * first_moved_tensor = nullptr;
  1292. ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
  1293. ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
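// helper that resolves a tensor name, picks a compatible buffer type for it (based on the op the tensor will be
// used with), creates the tensor metadata in the matching context, and keeps track of tensors that had to be
// moved away from the preferred buffer type of their layer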
  1294. auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
  1295. ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
  1296. if (!t_meta) {
  1297. if (flags & TENSOR_NOT_REQUIRED) {
  1298. return nullptr;
  1299. }
  1300. throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
  1301. }
1302. // some models reuse the token embedding tensor as the output; since the two uses sit in different layers and feed different ops,
1303. // the tensor has to be duplicated
1304. // to handle this, we check whether the tensor is flagged as duplicated and, if so, assume it is being loaded as the output tensor
  1305. llm_tensor tn_tensor = tn.tensor;
  1306. if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
  1307. tn_tensor = LLM_TENSOR_OUTPUT;
  1308. }
  1309. llm_tensor_info info;
  1310. try {
  1311. info = llm_tensor_info_for(tn_tensor);
  1312. } catch (const std::out_of_range & e) {
  1313. throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
  1314. }
  1315. // skip unused tensors
  1316. if (info.op == GGML_OP_NONE) {
  1317. LLAMA_LOG_WARN("model has unused tensor %s -- ignoring\n", tn.str().c_str());
  1318. ml.n_created++;
  1319. return nullptr;
  1320. }
  1321. // tensors with "bias" suffix are always used with GGML_OP_ADD
  1322. ggml_op op;
  1323. bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
  1324. if (bias) {
  1325. op = GGML_OP_ADD;
  1326. } else {
  1327. op = info.op;
  1328. }
  1329. // sanity checks
  1330. if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
  1331. if (tn.bid != -1) {
  1332. GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
  1333. }
  1334. } else {
  1335. if (tn.bid == -1) {
  1336. GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
  1337. }
  1338. }
  1339. // select the buffer type for this tensor
  1340. buft_list_t * buft_list;
  1341. switch (info.layer) {
  1342. case LLM_TENSOR_LAYER_INPUT:
  1343. buft_list = pimpl->dev_input.buft_list;
  1344. break;
  1345. case LLM_TENSOR_LAYER_OUTPUT:
  1346. buft_list = pimpl->dev_output.buft_list;
  1347. break;
  1348. case LLM_TENSOR_LAYER_REPEATING:
  1349. buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
  1350. break;
  1351. default:
  1352. GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
  1353. }
  1354. ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
  1355. if (!buft) {
  1356. throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
  1357. }
  1358. // avoid using a host buffer when using mmap
  1359. auto * buft_dev = ggml_backend_buft_get_device(buft);
  1360. if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
  1361. auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  1362. buft = ggml_backend_dev_buffer_type(cpu_dev);
  1363. }
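// (with mmap the weights can be used directly from the mapped file, so staging them in a pinned host buffer would
// only add an extra copy of the data)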
  1364. if (buft != buft_list->front().second) {
  1365. n_moved_tensors++;
  1366. if (!first_moved_tensor) {
  1367. first_moved_tensor = t_meta;
  1368. first_moved_from_buft = buft_list->front().second;
  1369. first_moved_to_buft = buft;
  1370. }
  1371. }
  1372. ggml_context * ctx = ctx_for_buft(buft);
  1373. // if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
  1374. if (flags & TENSOR_DUPLICATED) {
  1375. ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
  1376. if (t) {
  1377. return t;
  1378. }
  1379. }
  1380. return ml.create_tensor(ctx, tn, ne, flags);
  1381. };
  1382. layers.resize(n_layer);
  1383. // TODO: move to a separate function
  1384. const auto tn = LLM_TN(arch);
  1385. switch (arch) {
  1386. case LLM_ARCH_LLAMA:
  1387. case LLM_ARCH_REFACT:
  1388. case LLM_ARCH_MINICPM:
  1389. case LLM_ARCH_GRANITE:
  1390. case LLM_ARCH_GRANITE_MOE:
  1391. {
  1392. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1393. // output
  1394. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1395. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1396. // if output is NULL, init from the input tok embed
  1397. if (output == NULL) {
  1398. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1399. }
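// models with tied embeddings ship no separate output matrix, so the token embedding is loaded a second time as the
// output tensor; the TENSOR_DUPLICATED flag lets this copy land in the buffer type chosen for the output layer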
  1400. for (int i = 0; i < n_layer; ++i) {
  1401. auto & layer = layers[i];
  1402. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1403. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1404. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1405. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1406. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1407. // optional bias tensors
  1408. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1409. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1410. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1411. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1412. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1413. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  1414. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1415. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1416. }
  1417. else {
  1418. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1419. }
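// the rope factor tensors are shared by every layer: they are created once for layer 0 and re-fetched with
// TENSOR_DUPLICATED for the others, which is what the n_layer*2 head-room in max_n_tensors above accounts for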
  1420. if (n_expert == 0) {
  1421. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1422. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1423. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1424. // optional MLP bias
  1425. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1426. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1427. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1428. } else {
  1429. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1430. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  1431. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  1432. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1433. }
  1434. }
  1435. } break;
  1436. case LLM_ARCH_MLLAMA:
  1437. {
  1438. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab+8}, 0);
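// the 8 extra embedding rows presumably hold mllama's additional special tokens used by the cross-attention (vision) path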
  1439. // output
  1440. {
  1441. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1442. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  1443. // if output is NULL, init from the input tok embed
  1444. if (output == NULL) {
  1445. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  1446. }
  1447. }
  1448. for (int i = 0; i < n_layer; ++i) {
  1449. auto & layer = layers[i];
  1450. if (hparams.cross_attention_layers(i)) {
  1451. layer.cross_attn_k_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_NORM, "weight", i), {128}, 0);
  1452. layer.cross_attn_k_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_K_PROJ, "weight", i), {n_embd, 1024}, 0);
  1453. layer.cross_attn_o_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_O_PROJ, "weight", i), {n_embd, n_embd}, 0);
  1454. layer.cross_attn_q_norm = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_NORM, "weight", i), {128}, 0);
  1455. layer.cross_attn_q_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_Q_PROJ, "weight", i), {n_embd, n_embd}, 0);
  1456. layer.cross_attn_v_proj = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_V_PROJ, "weight", i), {n_embd, 1024}, 0);
  1457. layer.cross_attn_attn_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_ATTN_GATE, i), {1}, 0);
  1458. layer.cross_attn_mlp_gate = create_tensor(tn(LLM_TENSOR_CROSS_ATTN_MLP_GATE, i), {1}, 0);
  1459. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1460. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1461. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1462. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1463. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1464. } else {
  1465. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1466. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1467. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1468. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1469. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1470. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1471. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  1472. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1473. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1474. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1475. }
  1476. }
  1477. } break;
  1478. case LLM_ARCH_DECI:
  1479. {
  1480. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1481. // output
  1482. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1483. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1484. // if output is NULL, init from the input tok embed
  1485. if (output == NULL) {
  1486. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1487. }
  1488. for (int i = 0; i < n_layer; ++i) {
  1489. auto & layer = layers[i];
  1490. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
  1491. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
  1492. const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
  1493. const int64_t n_ff = hparams.n_ff(i);
  1494. const int64_t n_head = hparams.n_head(i);
  1495. const int64_t n_head_kv = hparams.n_head_kv(i);
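// Deci checkpoints vary the attention setup per layer: n_head_kv == 0 (with n_head > 0) marks a layer whose
// attention is replaced by a plain linear projection, so only attn_norm and wo are created; a layer with
// n_head == 0 gets no attention tensors at all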
  1496. if (n_head_kv == 0 && n_head > 0) {
  1497. // linear attention for DeciLMCausalModel
  1498. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1499. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1500. }
  1501. else if (n_head_kv > 0) {
  1502. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1503. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  1504. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  1505. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  1506. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  1507. }
  1508. // optional bias tensors
  1509. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1510. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1511. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1512. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1513. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1514. if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
  1515. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1516. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1517. }
  1518. else {
  1519. layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1520. }
  1521. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1522. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1523. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1524. // optional MLP bias
  1525. layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1526. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1527. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1528. }
  1529. } break;
  1530. case LLM_ARCH_MINICPM3:
  1531. {
  1532. const int64_t n_embd_head_qk_rope = hparams.n_rot;
  1533. const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  1534. const int64_t q_lora_rank = hparams.n_lora_q;
  1535. const int64_t kv_lora_rank = hparams.n_lora_kv;
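// MiniCPM3 uses a low-rank (MLA-style) attention factorization: Q goes through a q_lora_rank bottleneck,
// K/V through a kv_lora_rank bottleneck, and each head's key is split into a RoPE part (n_rot) and a non-RoPE part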
  1536. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1537. // output
  1538. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1539. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1540. // if output is NULL, init from the input tok embed
  1541. if (output == NULL) {
  1542. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1543. }
  1544. for (int i = 0; i < n_layer; ++i) {
  1545. auto & layer = layers[i];
  1546. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1547. layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
  1548. layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
  1549. layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
  1550. layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
  1551. layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
  1552. layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
  1553. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
  1554. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1555. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1556. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1557. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1558. layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1559. layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
  1560. }
  1561. } break;
  1562. case LLM_ARCH_GROK:
  1563. {
  1564. if (n_expert == 0) {
  1565. throw std::runtime_error("Grok model cannot have zero experts");
  1566. }
  1567. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1568. // output
  1569. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1570. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1571. // if output is NULL, init from the input tok embed
  1572. if (output == NULL) {
  1573. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1574. }
  1575. for (int i = 0; i < n_layer; ++i) {
  1576. auto & layer = layers[i];
  1577. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1578. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1579. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1580. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1581. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1582. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  1583. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1584. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1585. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
  1586. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
  1587. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1588. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  1589. }
  1590. } break;
  1591. case LLM_ARCH_DBRX:
  1592. {
  1593. if (n_expert == 0) {
  1594. throw std::runtime_error("DBRX model cannot have zero experts");
  1595. }
  1596. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1597. // output
  1598. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1599. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1600. for (int i = 0; i < n_layer; ++i) {
  1601. auto & layer = layers[i];
  1602. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1603. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1604. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1605. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  1606. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1607. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1608. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
  1609. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
  1610. }
  1611. } break;
  1612. case LLM_ARCH_BAICHUAN:
  1613. {
  1614. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1615. {
  1616. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1617. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1618. }
  1619. for (int i = 0; i < n_layer; ++i) {
  1620. auto & layer = layers[i];
  1621. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1622. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1623. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1624. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1625. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1626. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1627. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1628. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1629. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1630. }
  1631. } break;
  1632. case LLM_ARCH_FALCON:
  1633. {
  1634. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1635. // output
  1636. {
  1637. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1638. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1639. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1640. if (!output) {
  1641. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  1642. }
  1643. }
  1644. for (int i = 0; i < n_layer; ++i) {
  1645. auto & layer = layers[i];
  1646. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1647. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1648. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1649. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1650. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1651. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1652. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1653. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1654. }
  1655. } break;
  1656. case LLM_ARCH_STARCODER:
  1657. {
  1658. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1659. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  1660. // output
  1661. {
  1662. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1663. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1664. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1665. if (!output) {
  1666. // needs to be on GPU
  1667. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1668. }
  1669. }
  1670. for (int i = 0; i < n_layer; ++i) {
  1671. auto & layer = layers[i];
  1672. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1673. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1674. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1675. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  1676. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1677. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1678. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1679. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1680. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1681. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1682. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1683. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1684. }
  1685. } break;
  1686. case LLM_ARCH_BERT:
  1687. case LLM_ARCH_NOMIC_BERT:
  1688. {
  1689. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1690. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
  1691. if (arch == LLM_ARCH_BERT) {
  1692. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
  1693. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
  1694. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  1695. cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  1696. cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
  1697. }
  1698. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  1699. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  1700. for (int i = 0; i < n_layer; ++i) {
  1701. auto & layer = layers[i];
  1702. if (arch == LLM_ARCH_BERT) {
  1703. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1704. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1705. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1706. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1707. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1708. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  1709. } else {
  1710. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1711. }
  1712. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1713. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
  1714. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  1715. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1716. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1717. if (arch == LLM_ARCH_BERT) {
  1718. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1719. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1720. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1721. } else {
  1722. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1723. }
  1724. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  1725. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  1726. }
  1727. } break;
  1728. case LLM_ARCH_JINA_BERT_V2:
  1729. {
  1730. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
  1731. type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
  1732. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
1733. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); // LayerNorm bias
  1734. cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
  1735. cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
  1736. for (int i = 0; i < n_layer; ++i) {
  1737. auto & layer = layers[i]; // JinaBertLayer
  1738. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1739. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1740. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1741. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1742. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1743. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1744. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1745. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1746. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1747. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
1748. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); // output_dense
1749. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); // output_dense
1750. layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); // output_norm
  1751. layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
  1752. layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1753. layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1754. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1755. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1756. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1757. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1758. layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
  1759. layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
  1760. }
  1761. } break;
  1762. case LLM_ARCH_BLOOM:
  1763. {
  1764. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1765. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
  1766. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
  1767. // output
  1768. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1769. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1770. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1771. for (int i = 0; i < n_layer; ++i) {
  1772. auto & layer = layers[i];
  1773. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1774. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1775. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1776. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
  1777. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1778. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
  1779. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1780. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
  1781. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1782. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
  1783. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1784. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
  1785. }
  1786. } break;
  1787. case LLM_ARCH_MPT:
  1788. {
  1789. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1790. pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
  1791. // output
  1792. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1793. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
  1794. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1795. if (!output) {
  1796. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
  1797. }
  1798. for (int i = 0; i < n_layer; ++i) {
  1799. auto & layer = layers[i];
  1800. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1801. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1802. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
  1803. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1804. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1805. layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1806. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1807. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1808. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
  1809. layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1810. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1811. layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1812. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1813. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1814. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1815. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1816. // AWQ ScaleActivation layer
  1817. layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
  1818. }
  1819. } break;
  1820. case LLM_ARCH_STABLELM:
  1821. {
  1822. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1823. // output
  1824. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  1825. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1826. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1827. for (int i = 0; i < n_layer; ++i) {
  1828. auto & layer = layers[i];
  1829. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1830. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
  1831. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1832. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1833. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1834. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1835. // optional bias tensors, present in Stable LM 2 1.6B
  1836. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1837. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1838. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  1839. // optional q and k layernorms, present in StableLM 2 12B
  1840. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  1841. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  1842. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  1843. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1844. layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
  1845. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1846. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1847. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1848. }
  1849. } break;
  1850. case LLM_ARCH_QWEN:
  1851. {
  1852. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1853. // output
  1854. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1855. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1856. for (int i = 0; i < n_layer; ++i) {
  1857. auto & layer = layers[i];
  1858. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1859. layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
  1860. layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
  1861. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  1862. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1863. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
  1864. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
  1865. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
  1866. }
  1867. } break;
  1868. case LLM_ARCH_QWEN2:
  1869. case LLM_ARCH_QWEN2VL:
  1870. {
  1871. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1872. // output
  1873. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1874. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  1875. // if output is NULL, init from the input tok embed
  1876. if (output == NULL) {
  1877. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  1878. }
  1879. for (int i = 0; i < n_layer; ++i) {
  1880. auto & layer = layers[i];
  1881. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1882. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1883. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1884. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1885. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1886. // attention bias tensors (required for Qwen2 / Qwen2-VL)
  1887. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1888. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1889. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  1890. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1891. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  1892. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  1893. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  1894. }
  1895. } break;
  1896. case LLM_ARCH_QWEN2MOE:
  1897. {
  1898. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  1899. // output
  1900. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  1901. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  1902. for (int i = 0; i < n_layer; ++i) {
  1903. auto & layer = layers[i];
  1904. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  1905. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  1906. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  1907. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  1908. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
1909. // attention bias tensors (required for Qwen2-MoE)
  1910. layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
  1911. layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
  1912. layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
  1913. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  1914. layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
  1915. if (n_expert == 0) {
  1916. throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
  1917. }
  1918. if (n_expert_used == 0) {
  1919. throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
  1920. }
  1921. // MoE branch
  1922. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
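// older Qwen2-MoE GGUFs may not carry the expert feed-forward length, in which case the dense FFN width is
// split evenly across the experts that are active per token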
  1923. layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  1924. layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
  1925. layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
  1926. // Shared expert branch
  1927. const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  1928. layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
  1929. layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  1930. layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
  1931. layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
  1932. }
  1933. } break;
case LLM_ARCH_PHI2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
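// Phi-3: fused QKV is optional; the long/short RoPE factor tensors are loaded once and
// reused (marked duplicated) for every layer after the first.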
case LLM_ARCH_PHI3:
    {
        const int64_t n_embd_head = n_embd / n_head;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);
            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
        }
    } break;
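// PhiMoE: Phi-3-style attention (optional fused QKV, biased Q/K/V fallback) combined with
// a routed-expert FFN; the norms and the output head keep their biases.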
case LLM_ARCH_PHIMOE:
    {
        const int64_t n_embd_head = n_embd / n_head;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
        output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), { n_vocab }, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), { n_embd }, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
        }
    } break;
case LLM_ARCH_PLAMO:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
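// GPT-2: learned absolute position embeddings plus fused, biased QKV and biased MLP projections.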
case LLM_ARCH_GPT2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_CODESHELL:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
case LLM_ARCH_ORION:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_INTERNLM2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            // layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
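// Gemma: no separate output matrix in the file - the token embedding is reused as the LM head
// (duplicated so it can be offloaded).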
case LLM_ARCH_GEMMA:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
        }
    } break;
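// Gemma 2: same tied LM head as Gemma, with additional post-attention and post-FFN norms per layer.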
case LLM_ARCH_GEMMA2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_GEMMA3:
    {
    } break;
case LLM_ARCH_STARCODER2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            // optional bias tensors
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            // optional bias tensors
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
        }
    } break;
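// Mamba: selective state-space blocks - input/conv1d/dt projections and the A/D parameters
// take the place of attention tensors.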
case LLM_ARCH_MAMBA:
    {
        const int64_t d_conv = hparams.ssm_d_conv;
        const int64_t d_inner = hparams.ssm_d_inner;
        const int64_t d_state = hparams.ssm_d_state;
        const int64_t dt_rank = hparams.ssm_dt_rank;
        // only an expansion factor of 2 is supported for now
        if (2 * n_embd != d_inner) {
            throw std::runtime_error("only an expansion factor of 2 is supported for now");
        }
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed, duplicated to allow offloading
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            // norm
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
            layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
            layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
            // no "weight" suffix for these
            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
            layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
            // out_proj
            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
        }
    } break;
case LLM_ARCH_XVERSE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
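// Command R: tied LM head; per-head Q/K norms are only loaded for the deeper (n_layer >= 64) variants.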
case LLM_ARCH_COMMAND_R:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        // init output from the input tok embed
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            if (n_layer >= 64) {
                layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
                layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
            }
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_COHERE2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
        // init output from the input tok embed
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
        }
    } break;
case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_OLMO2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
            layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
        }
    } break;
case LLM_ARCH_OLMOE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            if (n_expert == 0) {
                throw std::runtime_error("n_expert must be > 0");
            }
            if (n_expert_used == 0) {
                throw std::runtime_error("n_expert_used must be > 0");
            }
            // MoE branch
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
        }
    } break;
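// OpenELM: head counts and FFN width vary per layer, so they are re-read from hparams inside the loop.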
case LLM_ARCH_OPENELM:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        // init output from the input tok embed
        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        for (int i = 0; i < n_layer; ++i) {
            const int64_t n_head = hparams.n_head(i);
            const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
            const int64_t n_ff = hparams.n_ff(i);
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_GPTNEOX:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
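// Arctic: each layer keeps a small dense FFN (n_embd x n_embd) alongside the routed-expert
// branch, which has its own norm (ffn_norm_exps).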
case LLM_ARCH_ARCTIC:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
            layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
        }
    } break;
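// DeepSeek: dense FFN for the first n_layer_dense_lead layers, routed experts plus shared
// experts (sized n_ff_exp * n_expert_shared) afterwards.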
case LLM_ARCH_DEEPSEEK:
    {
        const int64_t n_ff_exp = hparams.n_ff_exp;
        const int64_t n_expert_shared = hparams.n_expert_shared;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            if (i < (int) hparams.n_layer_dense_lead) {
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            } else {
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                if (n_expert == 0) {
                    throw std::runtime_error("n_expert must be > 0");
                }
                if (n_expert_used == 0) {
                    throw std::runtime_error("n_expert_used must be > 0");
                }
                // MoE branch
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                // Shared expert branch
                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
            }
        }
    } break;
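// DeepSeek-V2: low-rank (latent) Q and KV attention projections; the "lite" variant (27 layers)
// skips the Q LoRA path. FFN layout follows DeepSeek: dense lead layers, then MoE with shared experts.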
case LLM_ARCH_DEEPSEEK2:
    {
        const bool is_lite = (hparams.n_layer == 27);
        const int64_t n_embd_head_qk_rope = hparams.n_rot;
        const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
        const int64_t q_lora_rank = hparams.n_lora_q;
        const int64_t kv_lora_rank = hparams.n_lora_kv;
        const int64_t n_ff_exp = hparams.n_ff_exp;
        const int64_t n_expert_shared = hparams.n_expert_shared;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            if (!is_lite) {
                layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
            }
            layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
            if (!is_lite) {
                layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
            } else {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            }
            layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
            layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            if (i < (int) hparams.n_layer_dense_lead) {
                layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            } else {
                layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
                layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
                if (n_expert == 0) {
                    throw std::runtime_error("n_expert must be > 0");
                }
                if (n_expert_used == 0) {
                    throw std::runtime_error("n_expert_used must be > 0");
                }
                // MoE branch
                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
                layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
                // Shared expert branch
                layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
                layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
                layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
            }
        }
    } break;
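// BitNet: every projection may carry a companion per-tensor "scale", and extra sub-norms sit
// inside the attention and FFN blocks; there is no separate output head here.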
case LLM_ARCH_BITNET:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
        }
    } break;
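// T5: each layer holds both encoder and decoder tensors, including cross-attention and optional
// relative position bias tables.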
case LLM_ARCH_T5:
    {
        const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
            layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
            layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
            layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
            layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
            layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
            // this tensor seems to be unused in HF transformers implementation
            layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
            layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_T5ENCODER:
    {
        const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
        // if output is NULL, init from the input tok embed
        if (output == NULL) {
            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
        }
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
            layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
            layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
            layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
case LLM_ARCH_JAIS:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
        }
    } break;
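// ChatGLM: fused QKV when present, otherwise split Q/K/V with optional biases; the FFN up
// projection is 2*n_ff wide (gate and up presumably packed in a single tensor).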
case LLM_ARCH_CHATGLM:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
            if (layer.wqkv == nullptr) {
                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
                layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
                layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
                layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
                layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
            }
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
        }
    } break;
case LLM_ARCH_NEMOTRON:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
            // optional bias tensors
            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
            // optional MLP bias
            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
        }
    } break;
case LLM_ARCH_EXAONE:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
        }
    } break;
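// RWKV-6: time-mix (attention analogue) and channel-mix (FFN analogue) tensors; the per-channel
// lerp weights may be stored individually or as one fused tensor (at least one form must be present).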
case LLM_ARCH_RWKV6:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
        // Block 0, LN0
        tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
        tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
        // output
        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
        const int time_mix_extra_dim = hparams.time_mix_extra_dim;
        const int time_decay_extra_dim = hparams.time_decay_extra_dim;
        const int head_size = hparams.wkv_head_size;
        const int attn_hidden_size = n_embd;
        const int ffn_size = hparams.n_ff_arr[0];
        for (int i = 0; i < n_layer; ++i) {
            auto & layer = layers[i];
            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
            layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
            layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
            layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
            layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
            layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
            layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
            layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
            layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
            layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
            layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, llama_model_loader::TENSOR_NOT_REQUIRED);
            GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
            layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
            layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
            layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
            layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
            layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
            layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
            layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
            layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
            layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
            layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
            layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
            layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
            layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
            layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
            layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
            layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
        }
    } break;
case LLM_ARCH_RWKV6QWEN2:
    {
        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2783. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2784. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  2785. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
  2786. const int time_mix_extra_dim = hparams.time_mix_extra_dim;
  2787. const int time_decay_extra_dim = hparams.time_decay_extra_dim;
  2788. const int head_size = hparams.wkv_head_size;
  2789. const int attn_hidden_size = n_embd;
  2790. const int n_head_kv = hparams.n_head_kv();
  2791. int attn_key_value_size;
  2792. if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
  2793. attn_key_value_size = attn_hidden_size;
  2794. } else {
  2795. attn_key_value_size = n_head_kv * head_size;
  2796. }
  2797. for (int i = 0; i < n_layer; ++i) {
  2798. auto & layer = layers[i];
  2799. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2800. layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
  2801. layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
  2802. layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
  2803. layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
  2804. layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
  2805. layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
  2806. layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
  2807. layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
  2808. layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
  2809. layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
  2810. layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
  2811. layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
  2812. // optional bias tensors
  2813. layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
  2814. layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
  2815. layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
  2816. layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
  2817. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2818. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2819. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2820. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2821. }
  2822. } break;
  2823. case LLM_ARCH_CHAMELEON:
  2824. {
  2825. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2826. // output
  2827. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2828. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
  2829. // if output is NULL, init from the input tok embed
  2830. if (output == NULL) {
  2831. output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
  2832. }
  2833. for (int i = 0; i < n_layer; ++i) {
  2834. auto & layer = layers[i];
  2835. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2836. layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
  2837. layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
  2838. layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
  2839. layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
  2840. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
  2841. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
  2842. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
  2843. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  2844. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2845. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2846. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2847. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2848. }
  2849. } break;
  2850. case LLM_ARCH_SOLAR:
  2851. {
  2852. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
  2853. // output
  2854. {
  2855. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2856. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  2857. }
  2858. for (int i = 0; i < n_layer; ++i) {
  2859. auto & layer = layers[i];
  2860. layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
  2861. layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
  2862. layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
  2863. layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
  2864. layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
  2865. layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  2866. layer.bskcn_tv = create_tensor(tn(LLM_TENSOR_BSKCN_TV, "weight", i), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  2867. layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
  2868. layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
  2869. layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
  2870. }
  2871. } break;
  2872. case LLM_ARCH_WAVTOKENIZER_DEC:
  2873. {
  2874. tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
  2875. conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
  2876. conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
  2877. // posnet
  2878. {
  2879. const int64_t n_embd = hparams.posnet.n_embd;
  2880. for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
  2881. auto & layer = layers[i].posnet;
  2882. // posnet:
  2883. //
  2884. // - resnet
  2885. // - resnet
  2886. // - attn
  2887. // - resnet
  2888. // - resnet
  2889. // - norm
  2890. //
  2891. switch (i) {
  2892. case 0:
  2893. case 1:
  2894. case 3:
  2895. case 4:
  2896. {
  2897. layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
  2898. layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
  2899. layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
  2900. layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
  2901. layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
  2902. layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
  2903. layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
  2904. layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
  2905. } break;
  2906. case 2:
  2907. {
  2908. layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  2909. layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  2910. layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
  2911. layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
  2912. layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
  2913. layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
  2914. layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
  2915. layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
  2916. layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
  2917. layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
  2918. } break;
  2919. case 5:
  2920. {
  2921. layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
  2922. layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
  2923. } break;
  2924. default: GGML_ABORT("unknown posnet layer");
  2925. };
  2926. }
  2927. }
  2928. GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
  2929. tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
  2930. tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
  2931. // convnext
  2932. {
  2933. const int64_t n_embd = hparams.convnext.n_embd;
  2934. for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
  2935. auto & layer = layers[i].convnext;
  2936. layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
  2937. layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
  2938. layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
  2939. layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
  2940. layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
  2941. layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
  2942. layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
  2943. layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
  2944. layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
  2945. }
  2946. // output
  2947. output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
  2948. output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
  2949. }
  2950. output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
  2951. output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
  2952. } break;
  2953. default:
  2954. throw std::runtime_error("unknown architecture");
  2955. }
  2956. if (n_moved_tensors > 0) {
  2957. LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
  2958. __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
  2959. ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
  2960. }
  2961. }
  2962. ml.done_getting_tensors();
  2963. ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
  2964. pimpl->mappings.reserve(ml.mappings.size());
  2965. // create the backend buffers
  2966. std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
  2967. ctx_bufs.reserve(ctx_map.size());
  2968. // Ensure we have enough capacity for the maximum backend buffer we will potentially create
  2969. const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
  2970. pimpl->bufs.reserve(n_max_backend_buffer);
  2971. for (auto & it : ctx_map) {
  2972. ggml_backend_buffer_type_t buft = it.first;
  2973. ggml_context * ctx = it.second;
  2974. // skip contexts without tensors
  2975. if (ggml_get_first_tensor(ctx) == nullptr) {
  2976. continue;
  2977. }
  2978. llama_buf_map buf_map;
  2979. buf_map.reserve(n_max_backend_buffer);
  2980. // check if it is possible to use buffer_from_host_ptr with this buffer type
  2981. ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
  2982. if (!dev) {
  2983. // FIXME: workaround for CPU backend buft having a NULL device
  2984. dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
  2985. }
  2986. ggml_backend_dev_props props;
  2987. ggml_backend_dev_get_props(dev, &props);
  2988. bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
  2989. bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
  2990. if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
  2991. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  2992. // only the mmap region containing the tensors in the model is mapped to the backend buffer
  2993. // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
  2994. // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
  2995. void * addr = nullptr;
  2996. size_t first, last; // NOLINT
  2997. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  2998. if (first >= last) {
  2999. continue;
  3000. }
  3001. const size_t max_size = ggml_get_max_tensor_size(ctx);
  3002. ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
  3003. if (buf == nullptr) {
  3004. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  3005. }
  3006. pimpl->bufs.emplace_back(buf);
  3007. buf_map.emplace(idx, buf);
  3008. }
  3009. }
  3010. else {
  3011. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  3012. if (buf == nullptr) {
  3013. throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
  3014. }
  3015. pimpl->bufs.emplace_back(buf);
  3016. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  3017. pimpl->mlock_bufs.emplace_back(new llama_mlock);
  3018. auto & mlock_buf = pimpl->mlock_bufs.back();
  3019. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  3020. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  3021. }
  3022. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  3023. buf_map.emplace(idx, buf);
  3024. }
  3025. }
  3026. if (pimpl->bufs.empty()) {
  3027. throw std::runtime_error("failed to allocate buffer");
  3028. }
  3029. for (auto & buf : buf_map) {
  3030. // indicate that this buffer contains weights
  3031. // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
  3032. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  3033. }
  3034. ctx_bufs.emplace_back(ctx, buf_map);
  3035. }
  3036. if (llama_supports_gpu_offload()) {
  3037. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  3038. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  3039. if (n_gpu_layers > (int) hparams.n_layer) {
  3040. LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
  3041. }
  3042. const int max_backend_supported_layers = hparams.n_layer + 1;
  3043. const int max_offloadable_layers = hparams.n_layer + 1;
  3044. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  3045. }
  3046. // print memory requirements per buffer type
  3047. for (auto & buf : pimpl->bufs) {
  3048. LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
  3049. }
  3050. // populate tensors_by_name
  3051. for (auto & ctx : pimpl->ctxs) {
  3052. for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
  3053. tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  3054. }
  3055. }
  3056. // load tensor data
  3057. for (auto & it : ctx_bufs) {
  3058. ggml_context * ctx = it.first;
  3059. auto & bufs = it.second;
  3060. if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
  3061. return false;
  3062. }
  3063. }
  3064. if (use_mmap_buffer) {
  3065. for (auto & mapping : ml.mappings) {
  3066. pimpl->mappings.emplace_back(std::move(mapping));
  3067. }
  3068. }
  3069. return true;
  3070. }
  3071. std::string llama_model::arch_name() const {
  3072. return llm_arch_name(arch);
  3073. }
  3074. std::string llama_model::type_name() const {
  3075. return llm_type_name(type);
  3076. }
  3077. std::string llama_model::desc() const {
  3078. return pimpl->desc_str;
  3079. }
  3080. size_t llama_model::size() const {
  3081. return pimpl->n_bytes;
  3082. }
  3083. size_t llama_model::max_nodes() const {
  3084. return std::max<size_t>(8192, tensors_by_name.size()*5);
  3085. }
  3086. size_t llama_model::n_devices() const {
  3087. return devices.size();
  3088. }
  3089. uint64_t llama_model::n_elements() const {
  3090. return pimpl->n_elements;
  3091. }
  3092. void llama_model::print_info() const {
  3093. const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
  3094. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  3095. bool is_var = false;
  3096. std::vector<uint32_t> v;
  3097. for (uint32_t i = 0; i < n; ++i) {
  3098. v.push_back(f(i));
  3099. if (v[i] != v[0]) {
  3100. is_var = true;
  3101. }
  3102. }
  3103. std::stringstream ss;
  3104. if (is_var) {
  3105. ss << "[";
  3106. for (uint32_t i = 0; i < n; ++i) {
  3107. ss << v[i];
  3108. if (i < n - 1) {
  3109. ss << ", ";
  3110. }
  3111. }
  3112. ss << "]";
  3113. } else {
  3114. ss << v[0];
  3115. }
  3116. return ss.str();
  3117. };
  3118. // hparams
  3119. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
  3120. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  3121. if (!hparams.vocab_only) {
  3122. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  3123. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  3124. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  3125. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  3126. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  3127. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  3128. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  3129. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  3130. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  3131. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  3132. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  3133. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  3134. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  3135. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  3136. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  3137. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  3138. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  3139. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  3140. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  3141. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  3142. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  3143. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  3144. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  3145. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
  3146. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  3147. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  3148. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  3149. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  3150. LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  3151. LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  3152. LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  3153. LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  3154. LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
  3155. }
  3156. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
  3157. if (pimpl->n_elements >= 1e12) {
  3158. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
  3159. } else if (pimpl->n_elements >= 1e9) {
  3160. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
  3161. } else if (pimpl->n_elements >= 1e6) {
  3162. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
  3163. } else {
  3164. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
  3165. }
  3166. // general kv
  3167. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
  3168. if (arch == LLM_ARCH_DEEPSEEK) {
  3169. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  3170. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3171. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  3172. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  3173. }
  3174. if (arch == LLM_ARCH_DEEPSEEK2) {
  3175. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  3176. LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  3177. LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
  3178. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3179. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  3180. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  3181. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  3182. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
  3183. LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  3184. }
  3185. if (arch == LLM_ARCH_QWEN2MOE) {
  3186. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  3187. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  3188. }
  3189. if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
  3190. LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  3191. LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  3192. LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
  3193. }
  3194. vocab.print_info();
  3195. }
  3196. ggml_backend_dev_t llama_model::dev_layer(int il) const {
  3197. return pimpl->dev_layer.at(il).dev;
  3198. }
  3199. ggml_backend_dev_t llama_model::dev_output() const {
  3200. return pimpl->dev_output.dev;
  3201. }
  3202. template<typename F>
  3203. static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
  3204. ggml_init_params params = {
  3205. /*.mem_size =*/ ggml_tensor_overhead()*8,
  3206. /*.mem_buffer =*/ NULL,
  3207. /*.no_alloc =*/ true,
  3208. };
  3209. ggml_context_ptr ctx { ggml_init(params) };
  3210. if (!ctx) {
  3211. throw std::runtime_error(format("failed to create ggml context"));
  3212. }
  3213. ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
  3214. ggml_tensor * op_tensor = fn(ctx.get());
  3215. for (int i = 0; i < GGML_MAX_SRC; i++) {
  3216. if (op_tensor->src[i] != nullptr) {
  3217. assert(op_tensor->src[i]->buffer == nullptr);
  3218. op_tensor->src[i]->buffer = buf.get();
  3219. }
  3220. }
  3221. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  3222. return op_supported;
  3223. }
  3224. template<typename F>
  3225. static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
  3226. for (const auto & cur : buft_list) {
  3227. ggml_backend_dev_t cur_dev = cur.first;
  3228. ggml_backend_buffer_type_t cur_buft = cur.second;
  3229. if (buft_supported(cur_buft, cur_dev, fn)) {
  3230. return cur_buft;
  3231. }
  3232. }
  3233. throw std::runtime_error(format("no suitable buffer type found"));
  3234. }
  3235. ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
  3236. return ::select_buft(
  3237. *pimpl->dev_layer.at(il).buft_list,
  3238. [&](ggml_context * ctx) {
  3239. ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  3240. ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
  3241. return ggml_add(ctx, cur, layer_dir);
  3242. });
  3243. }
  3244. const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
  3245. auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
  3246. [name](const std::pair<std::string, struct ggml_tensor *> & it) {
  3247. return it.first == name;
  3248. });
  3249. if (it == tensors_by_name.end()) {
  3250. return nullptr;
  3251. }
  3252. return it->second;
  3253. }
  3254. //
  3255. // interface implementation
  3256. //
  3257. struct llama_model_params llama_model_default_params() {
  3258. struct llama_model_params result = {
  3259. /*.devices =*/ nullptr,
  3260. /*.n_gpu_layers =*/ 0,
  3261. /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  3262. /*.main_gpu =*/ 0,
  3263. /*.tensor_split =*/ nullptr,
  3264. /*.progress_callback =*/ nullptr,
  3265. /*.progress_callback_user_data =*/ nullptr,
  3266. /*.kv_overrides =*/ nullptr,
  3267. /*.vocab_only =*/ false,
  3268. /*.use_mmap =*/ true,
  3269. /*.use_mlock =*/ false,
  3270. /*.check_tensors =*/ false,
  3271. };
  3272. #ifdef GGML_USE_METAL
  3273. // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
  3274. result.n_gpu_layers = 999;
  3275. #endif
  3276. return result;
  3277. }
  3278. const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model) {
  3279. return &model->vocab;
  3280. }
  3281. void llama_free_model(struct llama_model * model) {
  3282. llama_model_free(model);
  3283. }
  3284. void llama_model_free(struct llama_model * model) {
  3285. delete model;
  3286. }
  3287. int32_t llama_model_n_ctx_train(const struct llama_model * model) {
  3288. return model->hparams.n_ctx_train;
  3289. }
  3290. int32_t llama_model_n_embd(const struct llama_model * model) {
  3291. return model->hparams.n_embd;
  3292. }
  3293. int32_t llama_model_n_layer(const struct llama_model * model) {
  3294. return model->hparams.n_layer;
  3295. }
  3296. int32_t llama_model_n_head(const struct llama_model * model) {
  3297. return model->hparams.n_head();
  3298. }
  3299. int32_t llama_model_n_head_kv(const struct llama_model * model) {
  3300. return model->hparams.n_head_kv();
  3301. }
  3302. // deprecated
  3303. int32_t llama_n_ctx_train(const struct llama_model * model) {
  3304. return llama_model_n_ctx_train(model);
  3305. }
  3306. // deprecated
  3307. int32_t llama_n_embd(const struct llama_model * model) {
  3308. return llama_model_n_embd(model);
  3309. }
  3310. // deprecated
  3311. int32_t llama_n_layer(const struct llama_model * model) {
  3312. return llama_model_n_layer(model);
  3313. }
  3314. // deprecated
  3315. int32_t llama_n_head(const struct llama_model * model) {
  3316. return llama_model_n_head(model);
  3317. }
  3318. enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
  3319. switch (model->arch) {
  3320. // these models do not use RoPE
  3321. case LLM_ARCH_GPT2:
  3322. case LLM_ARCH_GPTJ:
  3323. case LLM_ARCH_MPT:
  3324. case LLM_ARCH_REFACT:
  3325. case LLM_ARCH_BLOOM:
  3326. case LLM_ARCH_MAMBA:
  3327. case LLM_ARCH_JINA_BERT_V2:
  3328. case LLM_ARCH_T5:
  3329. case LLM_ARCH_T5ENCODER:
  3330. case LLM_ARCH_JAIS:
  3331. case LLM_ARCH_RWKV6:
  3332. case LLM_ARCH_RWKV6QWEN2:
  3333. case LLM_ARCH_WAVTOKENIZER_DEC:
  3334. return LLAMA_ROPE_TYPE_NONE;
  3335. // use what we call a normal RoPE, operating on pairs of consecutive head values
  3336. case LLM_ARCH_LLAMA:
  3337. case LLM_ARCH_MLLAMA:
  3338. case LLM_ARCH_DECI:
  3339. case LLM_ARCH_BAICHUAN:
  3340. case LLM_ARCH_STARCODER:
  3341. case LLM_ARCH_PLAMO:
  3342. case LLM_ARCH_ORION:
  3343. case LLM_ARCH_INTERNLM2:
  3344. case LLM_ARCH_MINICPM:
  3345. case LLM_ARCH_XVERSE:
  3346. case LLM_ARCH_COMMAND_R:
  3347. case LLM_ARCH_COHERE2:
  3348. case LLM_ARCH_OLMO:
  3349. case LLM_ARCH_ARCTIC:
  3350. case LLM_ARCH_DEEPSEEK:
  3351. case LLM_ARCH_DEEPSEEK2:
  3352. case LLM_ARCH_CHATGLM:
  3353. case LLM_ARCH_GRANITE:
  3354. case LLM_ARCH_GRANITE_MOE:
  3355. case LLM_ARCH_CHAMELEON:
  3356. case LLM_ARCH_SOLAR:
  3357. return LLAMA_ROPE_TYPE_NORM;
  3358. // the pairs of head values are offset by n_rot/2
  3359. case LLM_ARCH_FALCON:
  3360. case LLM_ARCH_GROK:
  3361. case LLM_ARCH_DBRX:
  3362. case LLM_ARCH_BERT:
  3363. case LLM_ARCH_NOMIC_BERT:
  3364. case LLM_ARCH_STABLELM:
  3365. case LLM_ARCH_BITNET:
  3366. case LLM_ARCH_QWEN:
  3367. case LLM_ARCH_QWEN2:
  3368. case LLM_ARCH_QWEN2MOE:
  3369. case LLM_ARCH_OLMO2:
  3370. case LLM_ARCH_OLMOE:
  3371. case LLM_ARCH_PHI2:
  3372. case LLM_ARCH_PHI3:
  3373. case LLM_ARCH_PHIMOE:
  3374. case LLM_ARCH_GEMMA:
  3375. case LLM_ARCH_GEMMA2:
  3376. case LLM_ARCH_GEMMA3:
  3377. case LLM_ARCH_STARCODER2:
  3378. case LLM_ARCH_OPENELM:
  3379. case LLM_ARCH_GPTNEOX:
  3380. case LLM_ARCH_CODESHELL:
  3381. case LLM_ARCH_NEMOTRON:
  3382. case LLM_ARCH_EXAONE:
  3383. case LLM_ARCH_MINICPM3:
  3384. return LLAMA_ROPE_TYPE_NEOX;
  3385. case LLM_ARCH_QWEN2VL:
  3386. return LLAMA_ROPE_TYPE_MROPE;
  3387. // all model arches should be listed explicitly here
  3388. case LLM_ARCH_UNKNOWN:
  3389. GGML_ABORT("unknown architecture");
  3390. }
  3391. return LLAMA_ROPE_TYPE_NONE;
  3392. }
  3393. float llama_model_rope_freq_scale_train(const struct llama_model * model) {
  3394. return model->hparams.rope_freq_scale_train;
  3395. }
  3396. int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
  3397. const auto & it = model->gguf_kv.find(key);
  3398. if (it == model->gguf_kv.end()) {
  3399. if (buf_size > 0) {
  3400. buf[0] = '\0';
  3401. }
  3402. return -1;
  3403. }
  3404. return snprintf(buf, buf_size, "%s", it->second.c_str());
  3405. }
  3406. int32_t llama_model_meta_count(const struct llama_model * model) {
  3407. return (int)model->gguf_kv.size();
  3408. }
  3409. int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
  3410. if (i < 0 || i >= (int)model->gguf_kv.size()) {
  3411. if (buf_size > 0) {
  3412. buf[0] = '\0';
  3413. }
  3414. return -1;
  3415. }
  3416. auto it = model->gguf_kv.begin();
  3417. std::advance(it, i);
  3418. return snprintf(buf, buf_size, "%s", it->first.c_str());
  3419. }
  3420. int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
  3421. if (i < 0 || i >= (int)model->gguf_kv.size()) {
  3422. if (buf_size > 0) {
  3423. buf[0] = '\0';
  3424. }
  3425. return -1;
  3426. }
  3427. auto it = model->gguf_kv.begin();
  3428. std::advance(it, i);
  3429. return snprintf(buf, buf_size, "%s", it->second.c_str());
  3430. }
  3431. int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  3432. return snprintf(buf, buf_size, "%s", model->desc().c_str());
  3433. }
  3434. uint64_t llama_model_size(const struct llama_model * model) {
  3435. return model->size();
  3436. }
  3437. const char * llama_model_chat_template(const struct llama_model * model, const char * name) {
  3438. const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE_N)
  3439. : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
  3440. const auto & it = model->gguf_kv.find(key);
  3441. if (it == model->gguf_kv.end()) {
  3442. return nullptr;
  3443. }
  3444. return it->second.c_str();
  3445. }
  3446. uint64_t llama_model_n_params(const struct llama_model * model) {
  3447. return model->n_elements();
  3448. }
  3449. bool llama_model_has_encoder(const struct llama_model * model) {
  3450. switch (model->arch) {
  3451. case LLM_ARCH_T5: return true;
  3452. case LLM_ARCH_T5ENCODER: return true;
  3453. default: return false;
  3454. }
  3455. }
  3456. bool llama_model_has_decoder(const struct llama_model * model) {
  3457. switch (model->arch) {
  3458. case LLM_ARCH_T5ENCODER: return false;
  3459. default: return true;
  3460. }
  3461. }
  3462. llama_token llama_model_decoder_start_token(const struct llama_model * model) {
  3463. return model->hparams.dec_start_token_id;
  3464. }
  3465. bool llama_model_is_recurrent(const struct llama_model * model) {
  3466. switch (model->arch) {
  3467. case LLM_ARCH_MAMBA: return true;
  3468. case LLM_ARCH_RWKV6: return true;
  3469. case LLM_ARCH_RWKV6QWEN2: return true;
  3470. default: return false;
  3471. }
  3472. }