llama.cpp 873 KB

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211662216623166241662516626166271662816629166301663116632166331663416635166361663716638166391664016641166421664316644166451664616647166481664916650166511665216653166541665516656166571665816659166601666116662166631666416665166661666716668166691667016671166721667316674166751667616677166781667916680166811668216683166841668516686166871668816689166901669116692166931669416695166961669716698166991670016701167021670316704167051670616707167081670916710167111671216713167141671516716167171671816719167201672116722167231672416725167261672716728167291673016731167321673316734167351673616737167381673916740167411674216743167441674516746167471674816749167501675116752167531675416755167561675716758167591676016761167621676316764167651676616767167681676916770167711677216773167741677516776167771677816779167801678116782167831678416785167861678716788167891679016791167921679316794167951679616797167981679916800168011680216803168041680516806168071680816809168101681116812168131681416815168161681716818168191682016821168221682316824168251682616827168281682916830168311683216833168341683516836168371683816839168401684116842168431684416845168461684716848168491685016851168521685316854168551685616857168581685916860168611686216863168641686516866168671686816869168701687116872168731687416875168761687716878168791688016881168821688316884168851688616887168881688916890168911689216893168941689516896168971689816899169001690116902169031690416905169061690716908169091691016911169121691316914169151691616917169181691916920169211692216923169241692516926169271692816929169301693116932169331693416935169361693716938169391694016941169421694316944169451694616947169481694916950169511695216953169541695516956169571695816959169601696116962169631696416965169661696716968169691697016971169721697316974169751697616977169781697916980169811698216983169841698516986169871698816989169901699116992169931699416995169961699716998169991700017001170021700317004170051700617007170081700917010170111701217013170141701517016170171701817019170201702117022170231702417025170261702717028170291703017031170321703317034170351703617037170381703917040170411704217043170441704517046170471704817049170501705117052170531705417055170561705717058170591706017061170621706317064170651706617067170681706917070170711707217073170741707517076170771707817079170801708117082170831708417085170861708717088170891709017091170921709317094170951709617097170981709917100171011710217103171041710517106171071710817109171101711117112171131711417115171161711717118171191712017121171221712317124171251712617127171281712917130171311713217133171341713517136171371713817139171401714117142171431714417145171461714717148171491715017151171521
715317154171551715617157171581715917160171611716217163171641716517166171671716817169171701717117172171731717417175171761717717178171791718017181171821718317184171851718617187171881718917190171911719217193171941719517196171971719817199172001720117202172031720417205172061720717208172091721017211172121721317214172151721617217172181721917220172211722217223172241722517226172271722817229172301723117232172331723417235172361723717238172391724017241172421724317244172451724617247172481724917250172511725217253172541725517256172571725817259172601726117262172631726417265172661726717268172691727017271172721727317274172751727617277172781727917280172811728217283172841728517286172871728817289172901729117292172931729417295172961729717298172991730017301173021730317304173051730617307173081730917310173111731217313173141731517316173171731817319173201732117322173231732417325173261732717328173291733017331173321733317334173351733617337173381733917340173411734217343173441734517346173471734817349173501735117352173531735417355173561735717358173591736017361173621736317364173651736617367173681736917370173711737217373173741737517376173771737817379173801738117382173831738417385173861738717388173891739017391173921739317394173951739617397173981739917400174011740217403174041740517406174071740817409174101741117412174131741417415174161741717418174191742017421174221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211782217823178241782517826178271782817829178301783117832178331783417835178361783717838178391784017841178421784317844178451784617847178481784917850178511785217853178541785517856178571785817859178601786117862178631
786417865178661786717868178691787017871178721787317874178751787617877178781787917880178811788217883178841788517886178871788817889178901789117892178931789417895178961789717898178991790017901179021790317904179051790617907179081790917910179111791217913179141791517916179171791817919179201792117922179231792417925179261792717928179291793017931179321793317934179351793617937179381793917940179411794217943179441794517946179471794817949179501795117952179531795417955179561795717958179591796017961179621796317964179651796617967179681796917970179711797217973179741797517976179771797817979179801798117982179831798417985179861798717988179891799017991179921799317994179951799617997179981799918000180011800218003180041800518006180071800818009180101801118012180131801418015180161801718018180191802018021180221802318024180251802618027180281802918030180311803218033180341803518036180371803818039180401804118042180431804418045180461804718048180491805018051180521805318054180551805618057180581805918060180611806218063180641806518066180671806818069180701807118072180731807418075180761807718078180791808018081180821808318084180851808618087180881808918090180911809218093180941809518096180971809818099181001810118102181031810418105181061810718108181091811018111181121811318114181151811618117181181811918120181211812218123181241812518126181271812818129181301813118132181331813418135181361813718138181391814018141181421814318144181451814618147181481814918150181511815218153181541815518156181571815818159181601816118162181631816418165181661816718168181691817018171181721817318174181751817618177181781817918180181811818218183181841818518186181871818818189181901819118192181931819418195181961819718198181991820018201182021820318204182051820618207182081820918210182111821218213182141821518216182171821818219182201822118222182231822418225182261822718228182291823018231182321823318234182351823618237182381823918240182411824218243182441824518246182471824818249182501825118252182531825418255182561825718258182591826018261182621826318264182651826618267182681826918270182711827218273182741827518276182771827818279182801828118282182831828418285182861828718288182891829018291182921829318294182951829618297182981829918300183011830218303183041830518306183071830818309183101831118312183131831418315183161831718318183191832018321183221832318324183251832618327183281832918330183311833218333183341833518336183371833818339183401834118342183431834418345183461834718348183491835018351183521835318354183551835618357183581835918360183611836218363183641836518366183671836818369183701837118372183731837418375183761837718378183791838018381183821838318384183851838618387183881838918390183911839218393183941839518396183971839818399184001840118402184031840418405184061840718408184091841018411184121841318414184151841618417184181841918420184211842218423184241842518426184271842818429184301843118432184331843418435184361843718438184391844018441184421844318444184451844618447184481844918450184511845218453184541845518456184571845818459184601846118462184631846418465184661846718468184691847018471184721847318474184751847618477184781847918480184811848218483184841848518486184871848818489184901849118492184931849418495184961849718498184991850018501185021850318504185051850618507185081850918510185111851218513185141851518516185171851818519185201852118522185231852418525185261852718528185291853018531185321853318534185351853618537185381853918540185411854218543185441854518546185471854818549185501855118552185531855418555185561855718558185591856018561185621856318564185651856618567185681856918570185711857218573185741
857518576185771857818579185801858118582185831858418585185861858718588185891859018591185921859318594185951859618597185981859918600186011860218603186041860518606186071860818609186101861118612186131861418615186161861718618186191862018621186221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211902219023190241902519026190271902819029190301903119032190331903419035190361903719038190391904019041190421904319044190451904619047190481904919050190511905219053190541905519056190571905819059190601906119062190631906419065190661906719068190691907019071190721907319074190751907619077190781907919080190811908219083190841908519086190871908819089190901909119092190931909419095190961909719098190991910019101191021910319104191051910619107191081910919110191111911219113191141911519116191171911819119191201912119122191231912419125191261912719128191291913019131191321913319134191351913619137191381913919140191411914219143191441914519146191471914819149191501915119152191531915419155191561915719158191591916019161191621916319164191651916619167191681916919170191711917219173191741917519176191771917819179191801918119182191831918419185191861918719188191891919019191191921919319194191951919619197191981919919200192011920219203192041920519206192071920819209192101921119212192131921419215192161921719218192191922019221192221922319224192251922619227192281922919230192311923219233192341923519236192371923819239192401924119242192431924419245192461924719248192491925019251192521925319254192551925619257192581925919260192611926219263192641926519266192671926819269192701927119272192731927419275192761927719278192791928019281192821928319284192851
928619287192881928919290192911929219293192941929519296192971929819299193001930119302193031930419305193061930719308193091931019311193121931319314193151931619317193181931919320193211932219323193241932519326193271932819329193301933119332193331933419335193361933719338193391934019341193421934319344193451934619347193481934919350193511935219353193541935519356193571935819359193601936119362193631936419365193661936719368193691937019371193721937319374193751937619377193781937919380193811938219383193841938519386193871938819389193901939119392193931939419395193961939719398193991940019401194021940319404194051940619407194081940919410194111941219413194141941519416194171941819419194201942119422194231942419425194261942719428194291943019431194321943319434194351943619437194381943919440194411944219443194441944519446194471944819449194501945119452194531945419455194561945719458194591946019461194621946319464194651946619467194681946919470194711947219473194741947519476194771947819479194801948119482194831948419485194861948719488194891949019491194921949319494194951949619497194981949919500195011950219503195041950519506195071950819509195101951119512195131951419515195161951719518195191952019521195221952319524195251952619527195281952919530195311953219533195341953519536195371953819539195401954119542195431954419545195461954719548195491955019551195521955319554195551955619557195581955919560195611956219563195641956519566195671956819569195701957119572195731957419575195761957719578195791958019581195821958319584195851958619587195881958919590195911959219593195941959519596195971959819599196001960119602196031960419605196061960719608196091961019611196121961319614196151961619617196181961919620196211962219623196241962519626196271962819629196301963119632196331963419635196361963719638196391964019641196421964319644196451964619647196481964919650196511965219653196541965519656196571965819659196601966119662196631966419665196661966719668196691967019671196721967319674196751967619677196781967919680196811968219683196841968519686196871968819689196901969119692196931969419695196961969719698196991970019701197021970319704197051970619707197081970919710197111971219713197141971519716197171971819719197201972119722197231972419725197261972719728197291973019731197321973319734197351973619737197381973919740197411974219743197441974519746197471974819749197501975119752197531975419755197561975719758197591976019761197621976319764197651976619767197681976919770197711977219773197741977519776197771977819779197801978119782197831978419785197861978719788197891979019791197921979319794197951979619797197981979919800198011980219803198041980519806198071980819809198101981119812198131981419815198161981719818198191982019821198221982319824198251982619827198281982919830198311983219833198341983519836198371983819839198401984119842198431984419845198461984719848198491985019851198521985319854198551985619857198581985919860198611986219863198641986519866198671986819869198701987119872198731987419875198761987719878198791988019881198821988319884198851988619887198881988919890198911989219893198941989519896198971989819899199001990119902199031990419905199061990719908199091991019911199121991319914199151991619917199181991919920199211992219923199241992519926199271992819929199301993119932199331993419935199361993719938199391994019941199421994319944199451994619947199481994919950199511995219953199541995519956199571995819959199601996119962199631996419965199661996719968199691997019971199721997319974199751997619977199781997919980199811998219983199841998519986199871998819989199901999119992199931999419995199961
999719998199992000020001200022000320004200052000620007200082000920010200112001220013200142001520016200172001820019200202002120022200232002420025200262002720028200292003020031200322003320034200352003620037200382003920040200412004220043200442004520046200472004820049200502005120052200532005420055200562005720058200592006020061200622006320064200652006620067200682006920070200712007220073200742007520076200772007820079200802008120082200832008420085200862008720088200892009020091200922009320094200952009620097200982009920100201012010220103201042010520106201072010820109201102011120112201132011420115201162011720118201192012020121201222012320124201252012620127201282012920130201312013220133201342013520136201372013820139201402014120142201432014420145201462014720148201492015020151201522015320154201552015620157201582015920160201612016220163201642016520166201672016820169201702017120172201732017420175201762017720178201792018020181201822018320184201852018620187201882018920190201912019220193201942019520196201972019820199202002020120202202032020420205202062020720208202092021020211202122021320214202152021620217202182021920220202212022220223202242022520226202272022820229202302023120232202332023420235202362023720238202392024020241202422024320244202452024620247202482024920250202512025220253202542025520256202572025820259202602026120262202632026420265202662026720268202692027020271202722027320274202752027620277202782027920280202812028220283202842028520286202872028820289202902029120292202932029420295202962029720298202992030020301203022030320304203052030620307203082030920310203112031220313203142031520316203172031820319203202032120322203232032420325203262032720328203292033020331203322033320334203352033620337203382033920340203412034220343203442034520346203472034820349203502035120352203532035420355203562035720358203592036020361203622036320364203652036620367203682036920370203712037220373203742037520376203772037820379203802038120382203832038420385203862038720388203892039020391203922039320394203952039620397203982039920400204012040220403204042040520406204072040820409204102041120412204132041420415204162041720418204192042020421204222042320424204252042620427204282042920430204312043220433204342043520436204372043820439204402044120442204432044420445204462044720448204492045020451204522045320454204552045620457204582045920460204612046220463204642046520466204672046820469204702047120472204732047420475204762047720478204792048020481204822048320484204852048620487204882048920490204912049220493204942049520496204972049820499205002050120502205032050420505205062050720508205092051020511205122051320514205152051620517205182051920520205212052220523205242052520526205272052820529205302053120532205332053420535205362053720538205392054020541205422054320544205452054620547205482054920550205512055220553205542055520556205572055820559205602056120562205632056420565205662056720568205692057020571205722057320574205752057620577205782057920580205812058220583205842058520586205872058820589205902059120592205932059420595205962059720598205992060020601206022060320604206052060620607206082060920610206112061220613206142061520616206172061820619206202062120622206232062420625206262062720628206292063020631206322063320634206352063620637206382063920640206412064220643206442064520646206472064820649206502065120652206532065420655206562065720658206592066020661206622066320664206652066620667206682066920670206712067220673206742067520676206772067820679206802068120682206832068420685206862068720688206892069020691206922069320694206952069620697206982069920700207012070220703207042070520706207072
07082070920710207112071220713207142071520716207172071820719207202072120722207232072420725207262072720728207292073020731207322073320734207352073620737207382073920740207412074220743207442074520746207472074820749207502075120752207532075420755207562075720758207592076020761207622076320764207652076620767207682076920770207712077220773207742077520776207772077820779207802078120782207832078420785207862078720788207892079020791207922079320794207952079620797207982079920800208012080220803208042080520806208072080820809208102081120812208132081420815208162081720818208192082020821208222082320824208252082620827208282082920830208312083220833208342083520836208372083820839208402084120842208432084420845208462084720848208492085020851208522085320854208552085620857208582085920860208612086220863208642086520866208672086820869208702087120872208732087420875208762087720878208792088020881208822088320884208852088620887208882088920890208912089220893208942089520896208972089820899209002090120902209032090420905209062090720908209092091020911209122091320914209152091620917209182091920920209212092220923209242092520926209272092820929209302093120932209332093420935209362093720938209392094020941209422094320944209452094620947209482094920950209512095220953209542095520956209572095820959209602096120962209632096420965209662096720968209692097020971209722097320974209752097620977209782097920980209812098220983209842098520986209872098820989209902099120992209932099420995209962099720998209992100021001210022100321004210052100621007210082100921010210112101221013210142101521016210172101821019210202102121022210232102421025210262102721028210292103021031210322103321034210352103621037210382103921040210412104221043210442104521046210472104821049210502105121052210532105421055210562105721058210592106021061210622106321064210652106621067210682106921070210712107221073210742107521076210772107821079210802108121082210832108421085210862108721088210892109021091210922109321094210952109621097210982109921100211012110221103211042110521106211072110821109211102111121112211132111421115
/**
 * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "llama-impl.h"
#include "llama-vocab.h"
#include "llama-grammar.h"
#include "llama-sampling.h"

#include "unicode.h"

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#ifdef GGML_USE_RPC
#  include "ggml-rpc.h"
#endif

#ifdef GGML_USE_CUDA
#  include "ggml-cuda.h"
#elif defined(GGML_USE_VULKAN)
#  include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
#  include "ggml-sycl.h"
#elif defined(GGML_USE_KOMPUTE)
#  include "ggml-kompute.h"
#elif defined(GGML_USE_CANN)
#  include "ggml-cann.h"
#endif

#ifdef GGML_USE_BLAS
#  include "ggml-blas.h"
#endif

#ifdef GGML_USE_METAL
#  include "ggml-metal.h"
#endif

// TODO: replace with ggml API call
#define QK_K 256

#ifdef __has_include
    #if __has_include(<unistd.h>)
        #include <unistd.h>
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
            #include <fcntl.h>
        #endif
        #if defined(_POSIX_MEMLOCK_RANGE)
            #include <sys/resource.h>
        #endif
    #endif
#endif

#if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
        #define NOMINMAX
    #endif
    #include <windows.h>
    #ifndef PATH_MAX
        #define PATH_MAX MAX_PATH
    #endif
    #include <io.h>
#endif

#if __cplusplus >= 202000L
    #define LU8(x) (const char*)(u8##x)
#else
    #define LU8(x) u8##x
#endif

#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <cfloat>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <functional>
#include <future>
#include <initializer_list>
#include <locale>
#include <map>
#include <memory>
#include <mutex>
#include <numeric>
#include <set>
#include <sstream>
#include <thread>
#include <type_traits>
#include <unordered_map>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

// bump if necessary
#define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2

//
// helpers
//

// trim whitespace from the beginning and end of a string
static std::string trim(const std::string & str) {
    size_t start = 0;
    size_t end = str.size();
    while (start < end && isspace(str[start])) {
        start += 1;
    }
    while (end > start && isspace(str[end - 1])) {
        end -= 1;
    }
    return str.substr(start, end - start);
}

static bool is_float_close(float a, float b, float abs_tol) {
    // Check for non-negative tolerance
    if (abs_tol < 0.0) {
        throw std::invalid_argument("Tolerance must be non-negative");
    }

    // Exact equality check
    if (a == b) {
        return true;
    }

    // Check for infinities
    if (std::isinf(a) || std::isinf(b)) {
        return false;
    }

    // Regular comparison using the provided absolute tolerance
    return std::fabs(b - a) <= abs_tol;
}
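
// Illustrative note (editor's addition, not part of the upstream source):
// is_float_close() uses an absolute tolerance only, e.g.
//   is_float_close(1.0f, 1.05f, 0.1f)        -> true   (|b - a| <= abs_tol)
//   is_float_close(1.0f, 1.2f,  0.1f)        -> false
//   is_float_close(INFINITY, INFINITY, 0.1f) -> true   (caught by the a == b check)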

static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
        file.write(&zero, 1);
    }
}

LLAMA_ATTRIBUTE_FORMAT(1, 2)
static std::string format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    GGML_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}
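
// Illustrative note (editor's addition, not part of the upstream source):
// format() is a printf-style helper that measures the result with a first
// vsnprintf() pass and fills the buffer with a second pass, e.g.
//   format("%s.%s", "llama", "context_length") -> "llama.context_length"
//   format("blk.%d.attn_q", 7)                 -> "blk.7.attn_q"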

//
// gguf constants (sync with gguf.py)
//

enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
    LLM_ARCH_GROK,
    LLM_ARCH_GPT2,
    LLM_ARCH_GPTJ,
    LLM_ARCH_GPTNEOX,
    LLM_ARCH_MPT,
    LLM_ARCH_STARCODER,
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
    LLM_ARCH_NOMIC_BERT,
    LLM_ARCH_JINA_BERT_V2,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
    LLM_ARCH_QWEN2,
    LLM_ARCH_QWEN2MOE,
    LLM_ARCH_PHI2,
    LLM_ARCH_PHI3,
    LLM_ARCH_PLAMO,
    LLM_ARCH_CODESHELL,
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
    LLM_ARCH_MINICPM,
    LLM_ARCH_GEMMA,
    LLM_ARCH_GEMMA2,
    LLM_ARCH_STARCODER2,
    LLM_ARCH_MAMBA,
    LLM_ARCH_XVERSE,
    LLM_ARCH_COMMAND_R,
    LLM_ARCH_DBRX,
    LLM_ARCH_OLMO,
    LLM_ARCH_OPENELM,
    LLM_ARCH_ARCTIC,
    LLM_ARCH_DEEPSEEK2,
    LLM_ARCH_CHATGLM,
    LLM_ARCH_BITNET,
    LLM_ARCH_T5,
    LLM_ARCH_T5ENCODER,
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_EXAONE,
    LLM_ARCH_RWKV6,
    LLM_ARCH_SOLAR,
    LLM_ARCH_UNKNOWN,
};

static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA, "llama" },
    { LLM_ARCH_FALCON, "falcon" },
    { LLM_ARCH_GROK, "grok" },
    { LLM_ARCH_GPT2, "gpt2" },
    { LLM_ARCH_GPTJ, "gptj" },
    { LLM_ARCH_GPTNEOX, "gptneox" },
    { LLM_ARCH_MPT, "mpt" },
    { LLM_ARCH_BAICHUAN, "baichuan" },
    { LLM_ARCH_STARCODER, "starcoder" },
    { LLM_ARCH_REFACT, "refact" },
    { LLM_ARCH_BERT, "bert" },
    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
    { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
    { LLM_ARCH_BLOOM, "bloom" },
    { LLM_ARCH_STABLELM, "stablelm" },
    { LLM_ARCH_QWEN, "qwen" },
    { LLM_ARCH_QWEN2, "qwen2" },
    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
    { LLM_ARCH_PHI2, "phi2" },
    { LLM_ARCH_PHI3, "phi3" },
    { LLM_ARCH_PLAMO, "plamo" },
    { LLM_ARCH_CODESHELL, "codeshell" },
    { LLM_ARCH_ORION, "orion" },
    { LLM_ARCH_INTERNLM2, "internlm2" },
    { LLM_ARCH_MINICPM, "minicpm" },
    { LLM_ARCH_GEMMA, "gemma" },
    { LLM_ARCH_GEMMA2, "gemma2" },
    { LLM_ARCH_STARCODER2, "starcoder2" },
    { LLM_ARCH_MAMBA, "mamba" },
    { LLM_ARCH_XVERSE, "xverse" },
    { LLM_ARCH_COMMAND_R, "command-r" },
    { LLM_ARCH_DBRX, "dbrx" },
    { LLM_ARCH_OLMO, "olmo" },
    { LLM_ARCH_OPENELM, "openelm" },
    { LLM_ARCH_ARCTIC, "arctic" },
    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
    { LLM_ARCH_CHATGLM, "chatglm" },
    { LLM_ARCH_BITNET, "bitnet" },
    { LLM_ARCH_T5, "t5" },
    { LLM_ARCH_T5ENCODER, "t5encoder" },
    { LLM_ARCH_JAIS, "jais" },
    { LLM_ARCH_NEMOTRON, "nemotron" },
    { LLM_ARCH_EXAONE, "exaone" },
    { LLM_ARCH_RWKV6, "rwkv6" },
    { LLM_ARCH_SOLAR, "solar" },
    { LLM_ARCH_UNKNOWN, "(unknown)" },
};

enum llm_kv {
    LLM_KV_GENERAL_TYPE,
    LLM_KV_GENERAL_ARCHITECTURE,
    LLM_KV_GENERAL_QUANTIZATION_VERSION,
    LLM_KV_GENERAL_ALIGNMENT,
    LLM_KV_GENERAL_NAME,
    LLM_KV_GENERAL_AUTHOR,
    LLM_KV_GENERAL_VERSION,
    LLM_KV_GENERAL_URL,
    LLM_KV_GENERAL_DESCRIPTION,
    LLM_KV_GENERAL_LICENSE,
    LLM_KV_GENERAL_SOURCE_URL,
    LLM_KV_GENERAL_SOURCE_HF_REPO,
    LLM_KV_VOCAB_SIZE,
    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_LEADING_DENSE_BLOCK_COUNT,
    LLM_KV_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,
    LLM_KV_EXPERT_COUNT,
    LLM_KV_EXPERT_USED_COUNT,
    LLM_KV_EXPERT_SHARED_COUNT,
    LLM_KV_EXPERT_WEIGHTS_SCALE,
    LLM_KV_POOLING_TYPE,
    LLM_KV_LOGIT_SCALE,
    LLM_KV_DECODER_START_TOKEN_ID,
    LLM_KV_ATTN_LOGIT_SOFTCAPPING,
    LLM_KV_FINAL_LOGIT_SOFTCAPPING,
    LLM_KV_RESCALE_EVERY_N_LAYERS,
    LLM_KV_TIME_MIX_EXTRA_DIM,
    LLM_KV_TIME_DECAY_EXTRA_DIM,
    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
    LLM_KV_ATTENTION_CLAMP_KQV,
    LLM_KV_ATTENTION_KEY_LENGTH,
    LLM_KV_ATTENTION_VALUE_LENGTH,
    LLM_KV_ATTENTION_LAYERNORM_EPS,
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
    LLM_KV_ATTENTION_CAUSAL,
    LLM_KV_ATTENTION_Q_LORA_RANK,
    LLM_KV_ATTENTION_KV_LORA_RANK,
    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
    LLM_KV_ATTENTION_SLIDING_WINDOW,
    LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION,
    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_ROPE_SCALING_TYPE,
    LLM_KV_ROPE_SCALING_FACTOR,
    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
    LLM_KV_ROPE_SCALING_FINETUNED,
    LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
    LLM_KV_SPLIT_NO,
    LLM_KV_SPLIT_COUNT,
    LLM_KV_SPLIT_TENSORS_COUNT,
    LLM_KV_SSM_INNER_SIZE,
    LLM_KV_SSM_CONV_KERNEL,
    LLM_KV_SSM_STATE_SIZE,
    LLM_KV_SSM_TIME_STEP_RANK,
    LLM_KV_SSM_DT_B_C_RMS,
    LLM_KV_WKV_HEAD_SIZE,
    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_PRE,
    LLM_KV_TOKENIZER_LIST,
    LLM_KV_TOKENIZER_TOKEN_TYPE,
    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
    LLM_KV_TOKENIZER_SCORES,
    LLM_KV_TOKENIZER_MERGES,
    LLM_KV_TOKENIZER_BOS_ID,
    LLM_KV_TOKENIZER_EOS_ID,
    LLM_KV_TOKENIZER_UNK_ID,
    LLM_KV_TOKENIZER_SEP_ID,
    LLM_KV_TOKENIZER_PAD_ID,
    LLM_KV_TOKENIZER_CLS_ID,
    LLM_KV_TOKENIZER_MASK_ID,
    LLM_KV_TOKENIZER_ADD_BOS,
    LLM_KV_TOKENIZER_ADD_EOS,
    LLM_KV_TOKENIZER_ADD_PREFIX,
    LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
    LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
    LLM_KV_TOKENIZER_HF_JSON,
    LLM_KV_TOKENIZER_RWKV,
    LLM_KV_TOKENIZER_PREFIX_ID,
    LLM_KV_TOKENIZER_SUFFIX_ID,
    LLM_KV_TOKENIZER_MIDDLE_ID,
    LLM_KV_TOKENIZER_EOT_ID,
    LLM_KV_TOKENIZER_EOM_ID,
    LLM_KV_ADAPTER_TYPE,
    LLM_KV_ADAPTER_LORA_ALPHA,
};

static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_GENERAL_TYPE, "general.type" },
    { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
    { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
    { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
    { LLM_KV_GENERAL_NAME, "general.name" },
    { LLM_KV_GENERAL_AUTHOR, "general.author" },
    { LLM_KV_GENERAL_VERSION, "general.version" },
    { LLM_KV_GENERAL_URL, "general.url" },
    { LLM_KV_GENERAL_DESCRIPTION, "general.description" },
    { LLM_KV_GENERAL_LICENSE, "general.license" },
    { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
    { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
    { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
    { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
    { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
    { LLM_KV_BLOCK_COUNT, "%s.block_count" },
    { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
    { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
    { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
    { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
    { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
    { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
    { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
    { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
    { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
    { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
    { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
    { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
    { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
    { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
    { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" },
    { LLM_KV_TIME_DECAY_EXTRA_DIM, "%s.time_decay_extra_dim" },
    { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
    { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
    { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" },
    { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" },
    { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" },
    { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
    { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
    { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
    { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
    { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
    { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
    { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
    { LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION, "%s.attention.block_skip_connection.%d" },
    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
    { LLM_KV_SPLIT_NO, "split.no" },
    { LLM_KV_SPLIT_COUNT, "split.count" },
    { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" },
    { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
    { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
    { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
    { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
    { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" },
    { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },
    { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
    { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
    { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
    { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
    { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
    { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" },
    { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" },
    { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" },
    { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" },
    { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
    { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
    { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
    { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" },
    { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
    { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
    { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
    { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
    { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
    { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
    { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
    { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
    { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
    { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
    { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
    { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" },
    { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" },
    { LLM_KV_ADAPTER_TYPE, "adapter.type" },
    { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
};

struct LLM_KV {
    LLM_KV(llm_arch arch) : arch(arch) {}

    llm_arch arch;

    std::string operator()(llm_kv kv) const {
        return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
    }
};
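
// Illustrative note (editor's addition, not part of the upstream source):
// LLM_KV substitutes the architecture name into the "%s" placeholders of
// LLM_KV_NAMES, e.g.
//   LLM_KV(LLM_ARCH_LLAMA)(LLM_KV_CONTEXT_LENGTH) -> "llama.context_length"
//   LLM_KV(LLM_ARCH_GEMMA2)(LLM_KV_BLOCK_COUNT)   -> "gemma2.block_count"
// Keys without a "%s" (e.g. "general.architecture") ignore the arch argument.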

enum llm_tensor {
    LLM_TENSOR_TOKEN_EMBD,
    LLM_TENSOR_TOKEN_EMBD_NORM,
    LLM_TENSOR_TOKEN_TYPES,
    LLM_TENSOR_POS_EMBD,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_ROPE_FREQS,
    LLM_TENSOR_ROPE_FACTORS_LONG,
    LLM_TENSOR_ROPE_FACTORS_SHORT,
    LLM_TENSOR_ATTN_Q,
    LLM_TENSOR_ATTN_K,
    LLM_TENSOR_ATTN_V,
    LLM_TENSOR_ATTN_QKV,
    LLM_TENSOR_ATTN_OUT,
    LLM_TENSOR_ATTN_NORM,
    LLM_TENSOR_ATTN_NORM_2,
    LLM_TENSOR_ATTN_OUT_NORM,
    LLM_TENSOR_ATTN_POST_NORM,
    LLM_TENSOR_ATTN_ROT_EMBD,
    LLM_TENSOR_FFN_GATE_INP,
    LLM_TENSOR_FFN_GATE_INP_SHEXP,
    LLM_TENSOR_FFN_NORM,
    LLM_TENSOR_FFN_POST_NORM,
    LLM_TENSOR_FFN_GATE,
    LLM_TENSOR_FFN_DOWN,
    LLM_TENSOR_FFN_UP,
    LLM_TENSOR_FFN_ACT,
    LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
    LLM_TENSOR_FFN_GATE_EXP,
    LLM_TENSOR_FFN_UP_EXP,
    LLM_TENSOR_FFN_NORM_EXPS,
    LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
    LLM_TENSOR_FFN_GATE_EXPS,
    LLM_TENSOR_FFN_UP_EXPS,
    LLM_TENSOR_FFN_DOWN_SHEXP,
    LLM_TENSOR_FFN_GATE_SHEXP,
    LLM_TENSOR_FFN_UP_SHEXP,
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
    LLM_TENSOR_LAYER_OUT_NORM,
    LLM_TENSOR_SSM_IN,
    LLM_TENSOR_SSM_CONV1D,
    LLM_TENSOR_SSM_X,
    LLM_TENSOR_SSM_DT,
    LLM_TENSOR_SSM_A,
    LLM_TENSOR_SSM_D,
    LLM_TENSOR_SSM_OUT,
    LLM_TENSOR_TIME_MIX_W1,
    LLM_TENSOR_TIME_MIX_W2,
    LLM_TENSOR_TIME_MIX_LERP_X,
    LLM_TENSOR_TIME_MIX_LERP_W,
    LLM_TENSOR_TIME_MIX_LERP_K,
    LLM_TENSOR_TIME_MIX_LERP_V,
    LLM_TENSOR_TIME_MIX_LERP_R,
    LLM_TENSOR_TIME_MIX_LERP_G,
    LLM_TENSOR_TIME_MIX_FIRST,
    LLM_TENSOR_TIME_MIX_DECAY,
    LLM_TENSOR_TIME_MIX_DECAY_W1,
    LLM_TENSOR_TIME_MIX_DECAY_W2,
    LLM_TENSOR_TIME_MIX_KEY,
    LLM_TENSOR_TIME_MIX_VALUE,
    LLM_TENSOR_TIME_MIX_RECEPTANCE,
    LLM_TENSOR_TIME_MIX_GATE,
    LLM_TENSOR_TIME_MIX_LN,
    LLM_TENSOR_TIME_MIX_OUTPUT,
    LLM_TENSOR_CHANNEL_MIX_LERP_K,
    LLM_TENSOR_CHANNEL_MIX_LERP_R,
    LLM_TENSOR_CHANNEL_MIX_KEY,
    LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
    LLM_TENSOR_CHANNEL_MIX_VALUE,
    LLM_TENSOR_ATTN_Q_A,
    LLM_TENSOR_ATTN_Q_B,
    LLM_TENSOR_ATTN_KV_A_MQA,
    LLM_TENSOR_ATTN_KV_B,
    LLM_TENSOR_ATTN_Q_A_NORM,
    LLM_TENSOR_ATTN_KV_A_NORM,
    LLM_TENSOR_ATTN_SUB_NORM,
    LLM_TENSOR_FFN_SUB_NORM,
    LLM_TENSOR_DEC_ATTN_NORM,
    LLM_TENSOR_DEC_ATTN_Q,
    LLM_TENSOR_DEC_ATTN_K,
    LLM_TENSOR_DEC_ATTN_V,
    LLM_TENSOR_DEC_ATTN_OUT,
    LLM_TENSOR_DEC_ATTN_REL_B,
    LLM_TENSOR_DEC_CROSS_ATTN_NORM,
    LLM_TENSOR_DEC_CROSS_ATTN_Q,
    LLM_TENSOR_DEC_CROSS_ATTN_K,
    LLM_TENSOR_DEC_CROSS_ATTN_V,
    LLM_TENSOR_DEC_CROSS_ATTN_OUT,
    LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
    LLM_TENSOR_DEC_FFN_NORM,
    LLM_TENSOR_DEC_FFN_GATE,
    LLM_TENSOR_DEC_FFN_DOWN,
    LLM_TENSOR_DEC_FFN_UP,
    LLM_TENSOR_DEC_OUTPUT_NORM,
    LLM_TENSOR_ENC_ATTN_NORM,
    LLM_TENSOR_ENC_ATTN_Q,
    LLM_TENSOR_ENC_ATTN_K,
    LLM_TENSOR_ENC_ATTN_V,
    LLM_TENSOR_ENC_ATTN_OUT,
    LLM_TENSOR_ENC_ATTN_REL_B,
    LLM_TENSOR_ENC_FFN_NORM,
    LLM_TENSOR_ENC_FFN_GATE,
    LLM_TENSOR_ENC_FFN_DOWN,
    LLM_TENSOR_ENC_FFN_UP,
    LLM_TENSOR_ENC_OUTPUT_NORM,
    LLM_TENSOR_BSKCN_TV,
};

static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
    {
        LLM_ARCH_LLAMA,
        {
            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
            { LLM_TENSOR_OUTPUT, "output" },
            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
            { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
            { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
            { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
  599. { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
  600. { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
  601. { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  602. },
  603. },
  604. {
  605. LLM_ARCH_BAICHUAN,
  606. {
  607. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  608. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  609. { LLM_TENSOR_OUTPUT, "output" },
  610. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  611. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  612. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  613. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  614. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  615. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  616. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  617. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  618. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  619. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  620. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  621. },
  622. },
  623. {
  624. LLM_ARCH_FALCON,
  625. {
  626. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  627. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  628. { LLM_TENSOR_OUTPUT, "output" },
  629. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  630. { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
  631. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  632. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  633. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  634. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  635. },
  636. },
  637. {
  638. LLM_ARCH_GROK,
  639. {
  640. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  641. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  642. { LLM_TENSOR_OUTPUT, "output" },
  643. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  644. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  645. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  646. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  647. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  648. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  649. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  650. { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  651. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  652. { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
  653. { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
  654. { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
  655. { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
  656. { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
  657. { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  658. { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
  659. { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
  660. },
  661. },
  662. {
  663. LLM_ARCH_GPT2,
  664. {
  665. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  666. { LLM_TENSOR_POS_EMBD, "position_embd" },
  667. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  668. { LLM_TENSOR_OUTPUT, "output" },
  669. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  670. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  671. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  672. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  673. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  674. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  675. },
  676. },
  677. {
  678. LLM_ARCH_GPTJ,
  679. {
  680. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  681. },
  682. },
  683. {
  684. LLM_ARCH_GPTNEOX,
  685. {
  686. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  687. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  688. { LLM_TENSOR_OUTPUT, "output" },
  689. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  690. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  691. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  692. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  693. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  694. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  695. },
  696. },
  697. {
  698. LLM_ARCH_MPT,
  699. {
  700. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  701. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
702. { LLM_TENSOR_OUTPUT, "output" },
  703. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  704. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  705. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  706. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  707. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  708. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  709. { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
  710. { LLM_TENSOR_POS_EMBD, "position_embd" },
711. { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
712. { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  713. },
  714. },
  715. {
  716. LLM_ARCH_STARCODER,
  717. {
  718. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  719. { LLM_TENSOR_POS_EMBD, "position_embd" },
  720. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  721. { LLM_TENSOR_OUTPUT, "output" },
  722. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  723. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  724. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  725. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  726. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  727. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  728. },
  729. },
  730. {
  731. LLM_ARCH_REFACT,
  732. {
  733. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  734. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  735. { LLM_TENSOR_OUTPUT, "output" },
  736. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  737. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  738. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  739. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  740. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  741. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  742. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  743. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  744. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  745. },
  746. },
  747. {
  748. LLM_ARCH_BERT,
  749. {
  750. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  751. { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  752. { LLM_TENSOR_TOKEN_TYPES, "token_types" },
  753. { LLM_TENSOR_POS_EMBD, "position_embd" },
  754. { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
  755. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  756. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  757. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  758. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  759. { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
  760. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  761. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  762. },
  763. },
  764. {
  765. LLM_ARCH_NOMIC_BERT,
  766. {
  767. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  768. { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  769. { LLM_TENSOR_TOKEN_TYPES, "token_types" },
  770. { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
  771. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  772. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  773. { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
  774. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  775. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  776. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  777. },
  778. },
  779. {
  780. LLM_ARCH_JINA_BERT_V2,
  781. {
  782. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  783. { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  784. { LLM_TENSOR_TOKEN_TYPES, "token_types" },
  785. { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
  786. { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
  787. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  788. { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
  789. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  790. { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  791. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  792. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  793. { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
  794. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  795. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  796. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  797. },
  798. },
  799. {
  800. LLM_ARCH_BLOOM,
  801. {
  802. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  803. { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  804. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  805. { LLM_TENSOR_OUTPUT, "output" },
  806. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  807. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  808. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  809. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  810. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  811. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  812. },
  813. },
  814. {
  815. LLM_ARCH_STABLELM,
  816. {
  817. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  818. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  819. { LLM_TENSOR_OUTPUT, "output" },
  820. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  821. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  822. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  823. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  824. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  825. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  826. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  827. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  828. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  829. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  830. { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
  831. { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  832. },
  833. },
  834. {
  835. LLM_ARCH_QWEN,
  836. {
  837. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  838. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  839. { LLM_TENSOR_OUTPUT, "output" },
  840. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  841. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  842. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  843. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  844. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  845. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  846. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  847. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  848. },
  849. },
  850. {
  851. LLM_ARCH_QWEN2,
  852. {
  853. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  854. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  855. { LLM_TENSOR_OUTPUT, "output" },
  856. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  857. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  858. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  859. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  860. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  861. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  862. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  863. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  864. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  865. },
  866. },
  867. {
  868. LLM_ARCH_QWEN2MOE,
  869. {
  870. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  871. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  872. { LLM_TENSOR_OUTPUT, "output" },
  873. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  874. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  875. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  876. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  877. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  878. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  879. { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  880. { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
  881. { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
  882. { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  883. { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
  884. { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
  885. { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
  886. { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
  887. },
  888. },
  889. {
  890. LLM_ARCH_PHI2,
  891. {
  892. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  893. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  894. { LLM_TENSOR_OUTPUT, "output" },
  895. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  896. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  897. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  898. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  899. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  900. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  901. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  902. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  903. },
  904. },
  905. {
  906. LLM_ARCH_PHI3,
  907. {
  908. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  909. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  910. { LLM_TENSOR_OUTPUT, "output" },
  911. { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
  912. { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
  913. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  914. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  915. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  916. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  917. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  918. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  919. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  920. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  921. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  922. },
  923. },
  924. {
  925. LLM_ARCH_PLAMO,
  926. {
  927. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  928. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  929. { LLM_TENSOR_OUTPUT, "output" },
  930. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  931. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  932. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  933. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  934. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  935. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  936. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  937. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  938. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  939. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  940. },
  941. },
  942. {
  943. LLM_ARCH_CODESHELL,
  944. {
  945. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  946. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  947. { LLM_TENSOR_OUTPUT, "output" },
  948. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  949. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  950. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  951. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  952. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  953. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  954. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  955. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  956. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  957. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  958. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  959. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  960. },
  961. },
  962. {
  963. LLM_ARCH_ORION,
  964. {
  965. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  966. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  967. { LLM_TENSOR_OUTPUT, "output" },
  968. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  969. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  970. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  971. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  972. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  973. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  974. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  975. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  976. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  977. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  978. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  979. },
  980. },
  981. {
  982. LLM_ARCH_INTERNLM2,
  983. {
  984. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  985. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  986. { LLM_TENSOR_OUTPUT, "output" },
  987. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  988. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  989. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  990. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  991. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  992. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  993. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  994. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  995. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  996. },
  997. },
  998. {
  999. LLM_ARCH_MINICPM,
  1000. {
  1001. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1002. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1003. { LLM_TENSOR_OUTPUT, "output" },
  1004. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  1005. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1006. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1007. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1008. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1009. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1010. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  1011. { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  1012. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1013. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1014. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1015. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1016. { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
  1017. { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
  1018. { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
  1019. },
  1020. },
  1021. {
  1022. LLM_ARCH_GEMMA,
  1023. {
  1024. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1025. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1026. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1027. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1028. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1029. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1030. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1031. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1032. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1033. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1034. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1035. },
  1036. },
  1037. {
  1038. LLM_ARCH_GEMMA2,
  1039. {
  1040. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1041. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1042. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1043. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1044. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1045. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1046. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1047. { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
  1048. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1049. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1050. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1051. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1052. { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
  1053. },
  1054. },
  1055. {
  1056. LLM_ARCH_STARCODER2,
  1057. {
  1058. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1059. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1060. { LLM_TENSOR_OUTPUT, "output" },
  1061. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  1062. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1063. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1064. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1065. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1066. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1067. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  1068. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1069. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1070. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1071. },
  1072. },
  1073. {
  1074. LLM_ARCH_MAMBA,
  1075. {
  1076. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1077. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1078. { LLM_TENSOR_OUTPUT, "output" },
  1079. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1080. { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
  1081. { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
  1082. { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
  1083. { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
  1084. { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
  1085. { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
  1086. { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
  1087. },
  1088. },
  1089. {
  1090. LLM_ARCH_XVERSE,
  1091. {
  1092. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1093. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1094. { LLM_TENSOR_OUTPUT, "output" },
  1095. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  1096. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1097. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1098. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1099. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1100. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1101. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  1102. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1103. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1104. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1105. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1106. },
  1107. },
  1108. {
  1109. LLM_ARCH_COMMAND_R,
  1110. {
  1111. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1112. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1113. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1114. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1115. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1116. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1117. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1118. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1119. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1120. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1121. { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
  1122. { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  1123. },
  1124. },
  1125. {
  1126. LLM_ARCH_DBRX,
  1127. {
  1128. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1129. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1130. { LLM_TENSOR_OUTPUT, "output" },
  1131. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  1132. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1133. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1134. { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
  1135. { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  1136. { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
  1137. { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
  1138. { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  1139. },
  1140. },
  1141. {
  1142. LLM_ARCH_OLMO,
  1143. {
  1144. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1145. { LLM_TENSOR_OUTPUT, "output" },
  1146. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1147. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1148. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1149. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1150. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1151. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1152. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1153. },
  1154. },
  1155. {
  1156. LLM_ARCH_OPENELM,
  1157. {
  1158. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1159. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1160. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1161. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  1162. { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
  1163. { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
  1164. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1165. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1166. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1167. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1168. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1169. },
  1170. },
  1171. {
  1172. LLM_ARCH_ARCTIC,
  1173. {
  1174. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1175. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1176. { LLM_TENSOR_OUTPUT, "output" },
  1177. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1178. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1179. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1180. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1181. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1182. { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  1183. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1184. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1185. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1186. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1187. { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
  1188. { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
  1189. { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
  1190. { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  1191. },
  1192. },
  1193. {
  1194. LLM_ARCH_DEEPSEEK2,
  1195. {
  1196. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1197. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1198. { LLM_TENSOR_OUTPUT, "output" },
  1199. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1200. { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
  1201. { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
  1202. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1203. { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
  1204. { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
  1205. { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
  1206. { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
  1207. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1208. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1209. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1210. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1211. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1212. { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
  1213. { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
  1214. { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
  1215. { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
  1216. { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
  1217. { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
  1218. { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
  1219. { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
  1220. },
  1221. },
  1222. {
  1223. LLM_ARCH_CHATGLM,
  1224. {
  1225. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1226. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  1227. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1228. { LLM_TENSOR_OUTPUT, "output" },
  1229. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1230. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  1231. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1232. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1233. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1234. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1235. },
  1236. },
  1237. {
  1238. LLM_ARCH_BITNET,
  1239. {
  1240. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1241. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1242. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1243. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1244. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1245. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1246. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1247. { LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
  1248. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1249. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1250. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1251. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1252. { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
  1253. },
  1254. },
  1255. {
  1256. LLM_ARCH_T5,
  1257. {
  1258. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1259. { LLM_TENSOR_OUTPUT, "output" },
  1260. { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" },
  1261. { LLM_TENSOR_DEC_ATTN_NORM, "dec.blk.%d.attn_norm" },
  1262. { LLM_TENSOR_DEC_ATTN_Q, "dec.blk.%d.attn_q" },
  1263. { LLM_TENSOR_DEC_ATTN_K, "dec.blk.%d.attn_k" },
  1264. { LLM_TENSOR_DEC_ATTN_V, "dec.blk.%d.attn_v" },
  1265. { LLM_TENSOR_DEC_ATTN_OUT, "dec.blk.%d.attn_o" },
  1266. { LLM_TENSOR_DEC_ATTN_REL_B, "dec.blk.%d.attn_rel_b" },
  1267. { LLM_TENSOR_DEC_CROSS_ATTN_NORM, "dec.blk.%d.cross_attn_norm" },
  1268. { LLM_TENSOR_DEC_CROSS_ATTN_Q, "dec.blk.%d.cross_attn_q" },
  1269. { LLM_TENSOR_DEC_CROSS_ATTN_K, "dec.blk.%d.cross_attn_k" },
  1270. { LLM_TENSOR_DEC_CROSS_ATTN_V, "dec.blk.%d.cross_attn_v" },
  1271. { LLM_TENSOR_DEC_CROSS_ATTN_OUT, "dec.blk.%d.cross_attn_o" },
  1272. { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" },
  1273. { LLM_TENSOR_DEC_FFN_NORM, "dec.blk.%d.ffn_norm" },
  1274. { LLM_TENSOR_DEC_FFN_GATE, "dec.blk.%d.ffn_gate" },
  1275. { LLM_TENSOR_DEC_FFN_DOWN, "dec.blk.%d.ffn_down" },
  1276. { LLM_TENSOR_DEC_FFN_UP, "dec.blk.%d.ffn_up" },
  1277. { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
  1278. { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
  1279. { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
  1280. { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
  1281. { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
  1282. { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
  1283. { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
  1284. { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
  1285. { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
  1286. { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
  1287. { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
  1288. },
  1289. },
  1290. {
  1291. LLM_ARCH_T5ENCODER,
  1292. {
  1293. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1294. { LLM_TENSOR_OUTPUT, "output" },
  1295. { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
  1296. { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
  1297. { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
  1298. { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
  1299. { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
  1300. { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
  1301. { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
  1302. { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
  1303. { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
  1304. { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
  1305. { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
  1306. },
  1307. },
  1308. {
  1309. LLM_ARCH_JAIS,
  1310. {
  1311. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1312. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1313. { LLM_TENSOR_OUTPUT, "output" },
  1314. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1315. { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
  1316. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1317. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1318. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1319. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1320. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1321. },
  1322. },
  1323. {
  1324. LLM_ARCH_NEMOTRON,
  1325. {
  1326. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1327. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1328. { LLM_TENSOR_OUTPUT, "output" },
  1329. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  1330. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1331. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1332. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1333. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1334. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1335. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  1336. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1337. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1338. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1339. },
  1340. },
  1341. {
  1342. LLM_ARCH_EXAONE,
  1343. {
  1344. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1345. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1346. { LLM_TENSOR_OUTPUT, "output" },
  1347. { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
  1348. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1349. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1350. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1351. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1352. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1353. { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
  1354. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1355. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1356. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1357. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1358. },
  1359. },
  1360. {
  1361. LLM_ARCH_RWKV6,
  1362. {
  1363. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1364. { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
  1365. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1366. { LLM_TENSOR_OUTPUT, "output" },
  1367. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1368. { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
  1369. { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
  1370. { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
  1371. { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
  1372. { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" },
  1373. { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" },
  1374. { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
  1375. { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
  1376. { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
  1377. { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
  1378. { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
  1379. { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
  1380. { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
  1381. { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
  1382. { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
  1383. { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
  1384. { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
  1385. { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
  1386. { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
  1387. { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
  1388. { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" },
  1389. { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
  1390. { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
  1391. { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
  1392. },
  1393. },
  1394. {
  1395. LLM_ARCH_SOLAR,
  1396. {
  1397. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1398. { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
  1399. { LLM_TENSOR_OUTPUT, "output" },
  1400. { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  1401. { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
  1402. { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
  1403. { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  1404. { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  1405. { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
  1406. { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
  1407. { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  1408. { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  1409. { LLM_TENSOR_BSKCN_TV, "bskcn_tv" },
  1410. },
  1411. },
  1412. {
  1413. LLM_ARCH_UNKNOWN,
  1414. {
  1415. { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
  1416. },
  1417. },
  1418. };
  1419. static llm_arch llm_arch_from_string(const std::string & name) {
  1420. for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
  1421. if (kv.second == name) {
  1422. return kv.first;
  1423. }
  1424. }
  1425. return LLM_ARCH_UNKNOWN;
  1426. }
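// usage sketch (illustrative; in practice the string comes from the "general.architecture" GGUF metadata value):
//
//   llm_arch arch = llm_arch_from_string("llama");      // -> LLM_ARCH_LLAMA
//   llm_arch arch = llm_arch_from_string("not-an-arch"); // -> LLM_ARCH_UNKNOWN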
  1427. // helper to handle gguf constants
  1428. // usage:
  1429. //
  1430. // const auto tn = LLM_TN(LLM_ARCH_LLAMA);
  1431. //
  1432. // std::string name = tn(LLM_TENSOR_OUTPUT); -> "output"
  1433. // std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias"); -> "token_embd.bias"
  1434. // std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
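// std::string name = tn(LLM_TENSOR_FFN_GATE_EXP, "weight", 3, 0); -> "blk.3.ffn_gate.0.weight" (split-expert form)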
  1435. //
  1436. struct LLM_TN {
  1437. LLM_TN(llm_arch arch) : arch(arch) {}
  1438. llm_arch arch;
  1439. std::string operator()(llm_tensor tensor) const {
  1440. if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
  1441. return "__missing__";
  1442. }
  1443. return LLM_TENSOR_NAMES.at(arch).at(tensor);
  1444. }
  1445. std::string operator()(llm_tensor tensor, const std::string & suffix) const {
  1446. if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
  1447. return "__missing__";
  1448. }
  1449. return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix;
  1450. }
  1451. std::string operator()(llm_tensor tensor, int bid) const {
  1452. if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
  1453. return "__missing__";
  1454. }
  1455. return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid);
  1456. }
  1457. std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const {
  1458. if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
  1459. return "__missing__";
  1460. }
  1461. return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix;
  1462. }
  1463. std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const {
  1464. if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) {
  1465. return "__missing__";
  1466. }
  1467. return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix;
  1468. }
  1469. };
  1470. //
  1471. // gguf helpers
  1472. //
  1473. static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
  1474. { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
  1475. { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
  1476. { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
  1477. };
  1478. static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
  1479. for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
  1480. if (kv.second == name) {
  1481. return (llama_rope_scaling_type) kv.first;
  1482. }
  1483. }
  1484. return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
  1485. }
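// usage sketch (illustrative; the string typically comes from the "%s.rope.scaling.type" metadata value):
//
//   llama_rope_scaling_type t = llama_rope_scaling_type_from_string("yarn"); // -> LLAMA_ROPE_SCALING_TYPE_YARN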
  1486. static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
  1487. switch (type) {
  1488. case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
  1489. case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
  1490. case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
  1491. case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
  1492. case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
  1493. case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
  1494. case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
  1495. case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
  1496. case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
  1497. case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
  1498. case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
  1499. default: return format("unknown type %d", type);
  1500. }
  1501. }
  1502. static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
  1503. const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
  1504. switch (type) {
  1505. case GGUF_TYPE_STRING:
  1506. return gguf_get_val_str(ctx_gguf, i);
  1507. case GGUF_TYPE_ARRAY:
  1508. {
  1509. const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
  1510. int arr_n = gguf_get_arr_n(ctx_gguf, i);
  1511. const void * data = gguf_get_arr_data(ctx_gguf, i);
  1512. std::stringstream ss;
  1513. ss << "[";
  1514. for (int j = 0; j < arr_n; j++) {
  1515. if (arr_type == GGUF_TYPE_STRING) {
  1516. std::string val = gguf_get_arr_str(ctx_gguf, i, j);
1517. // escape quotes and backslashes
  1518. replace_all(val, "\\", "\\\\");
  1519. replace_all(val, "\"", "\\\"");
  1520. ss << '"' << val << '"';
  1521. } else if (arr_type == GGUF_TYPE_ARRAY) {
  1522. ss << "???";
  1523. } else {
  1524. ss << gguf_data_to_str(arr_type, data, j);
  1525. }
  1526. if (j < arr_n - 1) {
  1527. ss << ", ";
  1528. }
  1529. }
  1530. ss << "]";
  1531. return ss.str();
  1532. }
  1533. default:
  1534. return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
  1535. }
  1536. }
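// usage sketch (illustrative; assumes a loaded gguf_context, e.g. when logging model metadata):
//
//   for (int i = 0; i < gguf_get_n_kv(ctx_gguf); i++) {
//       const char * name = gguf_get_key(ctx_gguf, i);
//       LLAMA_LOG_INFO("%s = %s\n", name, gguf_kv_to_str(ctx_gguf, i).c_str());
//   }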
  1537. //
  1538. // llama helpers
  1539. //
  1540. #if defined(_WIN32)
  1541. static std::string llama_format_win_err(DWORD err) {
  1542. LPSTR buf;
  1543. size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
  1544. NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
  1545. if (!size) {
  1546. return "FormatMessageA failed";
  1547. }
  1548. std::string ret(buf, size);
  1549. LocalFree(buf);
  1550. return ret;
  1551. }
  1552. #endif
  1553. template <typename T>
  1554. struct no_init {
  1555. T value;
  1556. no_init() { /* do nothing */ }
  1557. };
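// note: presumably used so buffers such as std::vector<no_init<T>> can grow without value-initializing every element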
  1558. struct llama_file {
  1559. #if defined(_WIN32)
  1560. // use FILE * so we don't have to re-open the file to mmap
  1561. FILE * fp;
  1562. HANDLE fp_win32;
  1563. size_t size;
  1564. private:
  1565. std::string GetErrorMessageWin32(DWORD error_code) const {
  1566. std::string ret;
  1567. LPSTR lpMsgBuf = NULL;
  1568. DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
  1569. NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
  1570. if (!bufLen) {
1571. ret = format("Win32 error code: %lu", error_code);
  1572. } else {
  1573. ret = lpMsgBuf;
  1574. LocalFree(lpMsgBuf);
  1575. }
  1576. return ret;
  1577. }
  1578. public:
  1579. llama_file(const char * fname, const char * mode) {
  1580. fp = ggml_fopen(fname, mode);
  1581. if (fp == NULL) {
  1582. throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
  1583. }
  1584. fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
  1585. seek(0, SEEK_END);
  1586. size = tell();
  1587. seek(0, SEEK_SET);
  1588. }
  1589. size_t tell() const {
1590. // SetFilePointerEx returns the current position when seeking 0 bytes relative to the current position
  1591. LARGE_INTEGER li;
  1592. li.QuadPart = 0;
  1593. BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
  1594. if (!ret) {
  1595. throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
  1596. }
  1597. return li.QuadPart;
  1598. }
  1599. void seek(size_t offset, int whence) const {
  1600. // no need to convert SEEK_* to FILE_*. The enums are the same.
  1601. // Still, keep static asserts to avoid failures in the future.
  1602. static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
  1603. static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
  1604. static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
  1605. LARGE_INTEGER li;
  1606. li.QuadPart = offset;
  1607. BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
  1608. if (!ret) {
  1609. throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
  1610. }
  1611. }
  1612. void read_raw(void * ptr, size_t len) const {
1613. // On Win32, ReadFile is significantly faster than fread, which in turn is significantly faster than std::fstream. Thus
1614. // use the Win32 API to do file I/O instead of the C/C++ library functions.
  1615. // There are conditions under which ReadFile cannot read chunks >64MB.
  1616. // Thus split the operation into smaller chunks if len exceeds this limit.
  1617. size_t bytes_read = 0;
  1618. while (bytes_read < len) {
  1619. size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
  1620. DWORD chunk_read = 0;
  1621. BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
  1622. if (!result) {
  1623. throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
  1624. }
  1625. if (chunk_read < chunk_size || chunk_read == 0) {
  1626. throw std::runtime_error("unexpectedly reached end of file");
  1627. }
  1628. bytes_read += chunk_read;
1629. }
  1630. }
  1631. uint32_t read_u32() const {
  1632. uint32_t val;
  1633. read_raw(&val, sizeof(val));
  1634. return val;
  1635. }
  1636. void write_raw(const void * ptr, size_t len) const {
  1637. // There are conditions under which WriteFile cannot write chunks >64MB.
  1638. // Thus split the operation into smaller chunks if len exceeds this limit.
  1639. size_t bytes_written = 0;
  1640. while (bytes_written < len) {
  1641. size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
  1642. DWORD chunk_written = 0;
  1643. BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
  1644. if (!result) {
  1645. throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
  1646. }
  1647. if (chunk_written < chunk_size || chunk_written == 0) {
  1648. throw std::runtime_error("unexpectedly failed to write bytes");
  1649. }
  1650. bytes_written += chunk_written;
  1651. }
  1652. }
  1653. void write_u32(std::uint32_t val) const {
  1654. write_raw(&val, sizeof(val));
  1655. }
  1656. ~llama_file() {
  1657. if (fp) {
  1658. std::fclose(fp);
  1659. }
  1660. }
  1661. #else
  1662. // use FILE * so we don't have to re-open the file to mmap
  1663. FILE * fp;
  1664. size_t size;
  1665. llama_file(const char * fname, const char * mode) {
  1666. fp = ggml_fopen(fname, mode);
  1667. if (fp == NULL) {
  1668. throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
  1669. }
  1670. seek(0, SEEK_END);
  1671. size = tell();
  1672. seek(0, SEEK_SET);
  1673. }
  1674. size_t tell() const {
  1675. #ifdef _WIN32
  1676. __int64 ret = _ftelli64(fp);
  1677. #else
  1678. long ret = std::ftell(fp);
  1679. #endif
  1680. if (ret == -1) {
  1681. throw std::runtime_error(format("ftell error: %s", strerror(errno)));
  1682. }
  1683. return (size_t) ret;
  1684. }
  1685. void seek(size_t offset, int whence) const {
  1686. #ifdef _WIN32
  1687. int ret = _fseeki64(fp, (__int64) offset, whence);
  1688. #else
  1689. int ret = std::fseek(fp, (long) offset, whence);
  1690. #endif
  1691. if (ret != 0) {
  1692. throw std::runtime_error(format("seek error: %s", strerror(errno)));
  1693. }
  1694. }
  1695. void read_raw(void * ptr, size_t len) const {
  1696. if (len == 0) {
  1697. return;
  1698. }
  1699. errno = 0;
  1700. std::size_t ret = std::fread(ptr, len, 1, fp);
  1701. if (ferror(fp)) {
  1702. throw std::runtime_error(format("read error: %s", strerror(errno)));
  1703. }
  1704. if (ret != 1) {
  1705. throw std::runtime_error("unexpectedly reached end of file");
  1706. }
  1707. }
  1708. uint32_t read_u32() const {
  1709. uint32_t ret;
  1710. read_raw(&ret, sizeof(ret));
  1711. return ret;
  1712. }
  1713. void write_raw(const void * ptr, size_t len) const {
  1714. if (len == 0) {
  1715. return;
  1716. }
  1717. errno = 0;
  1718. size_t ret = std::fwrite(ptr, len, 1, fp);
  1719. if (ret != 1) {
  1720. throw std::runtime_error(format("write error: %s", strerror(errno)));
  1721. }
  1722. }
  1723. void write_u32(std::uint32_t val) const {
  1724. write_raw(&val, sizeof(val));
  1725. }
  1726. ~llama_file() {
  1727. if (fp) {
  1728. std::fclose(fp);
  1729. }
  1730. }
  1731. #endif
  1732. };
  1733. using llama_files = std::vector<std::unique_ptr<llama_file>>;
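// usage sketch (illustrative; "model.gguf" is a placeholder path):
//
//   llama_file file("model.gguf", "rb");
//   uint32_t magic = file.read_u32(); // e.g. the GGUF magic at offset 0
//   file.seek(0, SEEK_SET);           // rewind before mapping the file with llama_mmap below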
  1734. struct llama_mmap {
  1735. void * addr;
  1736. size_t size;
  1737. llama_mmap(const llama_mmap &) = delete;
  1738. #ifdef _POSIX_MAPPED_FILES
  1739. static constexpr bool SUPPORTED = true;
  1740. // list of mapped fragments (first_offset, last_offset)
  1741. std::vector<std::pair<size_t, size_t>> mapped_fragments;
  1742. llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
  1743. size = file->size;
  1744. int fd = fileno(file->fp);
  1745. int flags = MAP_SHARED;
  1746. // prefetch/readahead impairs performance on NUMA systems
  1747. if (numa) { prefetch = 0; }
  1748. #ifdef __linux__
  1749. // advise the kernel to read the file sequentially (increases readahead)
  1750. if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
  1751. LLAMA_LOG_WARN("warning: posix_fadvise(.., POSIX_FADV_SEQUENTIAL) failed: %s\n",
  1752. strerror(errno));
  1753. }
  1754. if (prefetch) { flags |= MAP_POPULATE; }
  1755. #endif
  1756. addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
  1757. if (addr == MAP_FAILED) { // NOLINT
  1758. throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
  1759. }
  1760. if (prefetch > 0) {
  1761. // advise the kernel to preload the mapped memory
  1762. if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
  1763. LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
  1764. strerror(errno));
  1765. }
  1766. }
  1767. if (numa) {
  1768. // advise the kernel not to use readahead
  1769. // (because the next page might not belong on the same node)
  1770. if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
  1771. LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
  1772. strerror(errno));
  1773. }
  1774. }
  1775. // initialize list of mapped_fragments
  1776. mapped_fragments.emplace_back(0, file->size);
  1777. }
  1778. static void align_range(size_t * first, size_t * last, size_t page_size) {
  1779. // align first to the next page
  1780. size_t offset_in_page = *first & (page_size - 1);
  1781. size_t offset_to_page = offset_in_page == 0 ? 0 : page_size - offset_in_page;
  1782. *first += offset_to_page;
  1783. // align last to the previous page
  1784. *last = *last & ~(page_size - 1);
  1785. if (*last <= *first) {
  1786. *last = *first;
  1787. }
  1788. }
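// e.g. with page_size = 4096: first = 100, last = 9000 becomes first = 4096, last = 8192,
// so only pages that lie entirely inside the original range are considered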
  1789. // partially unmap the file in the range [first, last)
  1790. void unmap_fragment(size_t first, size_t last) {
  1791. // note: this function must not be called multiple times with overlapping ranges
  1792. // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings
  1793. int page_size = sysconf(_SC_PAGESIZE);
  1794. align_range(&first, &last, page_size);
  1795. size_t len = last - first;
  1796. if (len == 0) {
  1797. return;
  1798. }
  1799. GGML_ASSERT(first % page_size == 0);
  1800. GGML_ASSERT(last % page_size == 0);
  1801. GGML_ASSERT(last > first);
  1802. void * next_page_start = (uint8_t *) addr + first;
  1803. // unmap the range
  1804. if (munmap(next_page_start, len)) {
  1805. LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
  1806. }
  1807. // update the list of mapped fragments to avoid unmapping the same range again in the destructor
  1808. std::vector<std::pair<size_t, size_t>> new_mapped_fragments;
  1809. for (const auto & frag : mapped_fragments) {
  1810. if (frag.first < first && frag.second > last) {
  1811. // the range is in the middle of the fragment, split it
  1812. new_mapped_fragments.emplace_back(frag.first, first);
  1813. new_mapped_fragments.emplace_back(last, frag.second);
  1814. } else if (frag.first < first && frag.second > first) {
  1815. // the range starts in the middle of the fragment
  1816. new_mapped_fragments.emplace_back(frag.first, first);
  1817. } else if (frag.first < last && frag.second > last) {
  1818. // the range ends in the middle of the fragment
  1819. new_mapped_fragments.emplace_back(last, frag.second);
  1820. } else if (frag.first >= first && frag.second <= last) {
  1821. // the range covers the entire fragment
  1822. } else {
  1823. // the range is outside the fragment
  1824. new_mapped_fragments.push_back(frag);
  1825. }
  1826. }
  1827. mapped_fragments = std::move(new_mapped_fragments);
  1828. }
  1829. ~llama_mmap() {
  1830. for (const auto & frag : mapped_fragments) {
  1831. if (munmap((char *) addr + frag.first, frag.second - frag.first)) {
  1832. LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
  1833. }
  1834. }
  1835. }
  1836. #elif defined(_WIN32)
  1837. static constexpr bool SUPPORTED = true;
  1838. llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
  1839. GGML_UNUSED(numa);
  1840. size = file->size;
  1841. HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
  1842. HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
  1843. if (hMapping == NULL) {
  1844. DWORD error = GetLastError();
  1845. throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
  1846. }
  1847. addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
  1848. DWORD error = GetLastError();
  1849. CloseHandle(hMapping);
  1850. if (addr == NULL) {
  1851. throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
  1852. }
  1853. if (prefetch > 0) {
  1854. #if _WIN32_WINNT >= 0x602
  1855. // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
  1856. BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
  1857. HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
  1858. // may fail on pre-Windows 8 systems
  1859. pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
  1860. if (pPrefetchVirtualMemory) {
  1861. // advise the kernel to preload the mapped memory
  1862. WIN32_MEMORY_RANGE_ENTRY range;
  1863. range.VirtualAddress = addr;
  1864. range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
  1865. if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
  1866. LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
  1867. llama_format_win_err(GetLastError()).c_str());
  1868. }
  1869. }
  1870. #else
  1871. throw std::runtime_error("PrefetchVirtualMemory unavailable");
  1872. #endif
  1873. }
  1874. }
  1875. void unmap_fragment(size_t first, size_t last) {
  1876. // not supported
  1877. GGML_UNUSED(first);
  1878. GGML_UNUSED(last);
  1879. }
  1880. ~llama_mmap() {
  1881. if (!UnmapViewOfFile(addr)) {
  1882. LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n",
  1883. llama_format_win_err(GetLastError()).c_str());
  1884. }
  1885. }
  1886. #else
  1887. static constexpr bool SUPPORTED = false;
1888. llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) {
  1889. GGML_UNUSED(file);
  1890. GGML_UNUSED(prefetch);
  1891. GGML_UNUSED(numa);
  1892. throw std::runtime_error("mmap not supported");
  1893. }
  1894. void unmap_fragment(size_t first, size_t last) {
  1895. GGML_UNUSED(first);
  1896. GGML_UNUSED(last);
  1897. throw std::runtime_error("mmap not supported");
  1898. }
  1899. #endif
  1900. };
  1901. using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
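// Usage sketch (illustrative only, not called anywhere in this file): map a file, read from the
// mapped address, and release a prefix of the mapping once it is no longer needed. The file name
// below is made up for the example.
static void llama_mmap_usage_example() {
llama_file file("model.gguf", "rb"); // hypothetical path
llama_mmap mapping(&file); // by default the OS is asked to prefetch the whole file
const uint8_t * data = (const uint8_t *) mapping.addr;
GGML_UNUSED(data);
// on POSIX systems a page-aligned prefix can be returned to the OS early;
// on Windows unmap_fragment() is a no-op and the whole view is released in the destructor
mapping.unmap_fragment(0, 4096);
}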
  1902. // Represents some region of memory being locked using mlock or VirtualLock;
  1903. // will automatically unlock on destruction.
  1904. struct llama_mlock {
  1905. void * addr = NULL;
  1906. size_t size = 0;
  1907. bool failed_already = false;
  1908. llama_mlock() {}
  1909. llama_mlock(const llama_mlock &) = delete;
  1910. ~llama_mlock() {
  1911. if (size) {
  1912. raw_unlock(addr, size);
  1913. }
  1914. }
  1915. void init(void * ptr) {
  1916. GGML_ASSERT(addr == NULL && size == 0); // NOLINT
  1917. addr = ptr;
  1918. }
  1919. void grow_to(size_t target_size) {
  1920. GGML_ASSERT(addr);
  1921. if (failed_already) {
  1922. return;
  1923. }
  1924. size_t granularity = lock_granularity();
  1925. target_size = (target_size + granularity - 1) & ~(granularity - 1);
  1926. if (target_size > size) {
  1927. if (raw_lock((uint8_t *) addr + size, target_size - size)) {
  1928. size = target_size;
  1929. } else {
  1930. failed_already = true;
  1931. }
  1932. }
  1933. }
  1934. #ifdef _POSIX_MEMLOCK_RANGE
  1935. static constexpr bool SUPPORTED = true;
  1936. static size_t lock_granularity() {
  1937. return (size_t) sysconf(_SC_PAGESIZE);
  1938. }
  1939. #ifdef __APPLE__
  1940. #define MLOCK_SUGGESTION \
  1941. "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
  1942. "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
  1943. #else
  1944. #define MLOCK_SUGGESTION \
  1945. "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
  1946. #endif
  1947. bool raw_lock(const void * addr, size_t size) const {
  1948. if (!mlock(addr, size)) {
  1949. return true;
  1950. }
  1951. char* errmsg = std::strerror(errno);
  1952. bool suggest = (errno == ENOMEM);
  1953. // Check if the resource limit is fine after all
  1954. struct rlimit lock_limit;
  1955. if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) {
  1956. suggest = false;
  1957. }
  1958. if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) {
  1959. suggest = false;
  1960. }
  1961. LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
  1962. size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
  1963. return false;
  1964. }
  1965. #undef MLOCK_SUGGESTION
  1966. static void raw_unlock(void * addr, size_t size) {
  1967. if (munlock(addr, size)) {
  1968. LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
  1969. }
  1970. }
  1971. #elif defined(_WIN32)
  1972. static constexpr bool SUPPORTED = true;
  1973. static size_t lock_granularity() {
  1974. SYSTEM_INFO si;
  1975. GetSystemInfo(&si);
  1976. return (size_t) si.dwPageSize;
  1977. }
  1978. bool raw_lock(void * ptr, size_t len) const {
  1979. for (int tries = 1; ; tries++) {
  1980. if (VirtualLock(ptr, len)) {
  1981. return true;
  1982. }
  1983. if (tries == 2) {
  1984. LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
  1985. len, size, llama_format_win_err(GetLastError()).c_str());
  1986. return false;
  1987. }
  1988. // It failed but this was only the first try; increase the working
  1989. // set size and try again.
  1990. SIZE_T min_ws_size, max_ws_size;
  1991. if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
  1992. LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
  1993. llama_format_win_err(GetLastError()).c_str());
  1994. return false;
  1995. }
  1996. // Per MSDN: "The maximum number of pages that a process can lock
  1997. // is equal to the number of pages in its minimum working set minus
  1998. // a small overhead."
  1999. // Hopefully a megabyte is enough overhead:
  2000. size_t increment = len + 1048576;
  2001. // The minimum must be <= the maximum, so we need to increase both:
  2002. min_ws_size += increment;
  2003. max_ws_size += increment;
  2004. if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
  2005. LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
  2006. llama_format_win_err(GetLastError()).c_str());
  2007. return false;
  2008. }
  2009. }
  2010. }
  2011. static void raw_unlock(void * ptr, size_t len) {
  2012. if (!VirtualUnlock(ptr, len)) {
  2013. LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
  2014. llama_format_win_err(GetLastError()).c_str());
  2015. }
  2016. }
  2017. #else
  2018. static constexpr bool SUPPORTED = false;
  2019. static size_t lock_granularity() {
  2020. return (size_t) 65536;
  2021. }
  2022. bool raw_lock(const void * addr, size_t len) const {
  2023. LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
  2024. return false;
  2025. }
  2026. static void raw_unlock(const void * addr, size_t len) {}
  2027. #endif
  2028. };
  2029. using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
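// Usage sketch (illustrative only): lock an already-allocated buffer in physical memory.
// The buffer below is hypothetical; the real callers pass model/mmap buffer addresses.
static void llama_mlock_usage_example() {
std::vector<uint8_t> buf(1u << 20); // 1 MiB of example data
llama_mlock lock;
lock.init(buf.data()); // remember the base address (can only be done once)
lock.grow_to(buf.size()); // lock [base, base + size), rounded up to the page granularity
// whatever was successfully locked is unlocked again by the destructor
}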
  2030. // NOTE: avoid ever using this except for building the token_to_piece caches
  2031. static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
  2032. std::string piece;
  2033. piece.resize(piece.capacity()); // using string internal cache
  2034. const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
  2035. if (n_chars < 0) {
  2036. piece.resize(-n_chars);
  2037. int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
  2038. GGML_ASSERT(check == -n_chars);
  2039. }
  2040. else {
  2041. piece.resize(n_chars);
  2042. }
  2043. return piece;
  2044. }
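// Usage sketch (illustrative only; the actual cache construction happens in the vocab loading code
// further down): the helper above is intended to be called once per vocabulary entry, relying on the
// negative return value of the public llama_token_to_piece() to grow the string on the first pass.
static std::vector<std::string> llama_token_to_piece_cache_example(const struct llama_model * model, int32_t n_vocab) {
std::vector<std::string> cache(n_vocab);
for (int32_t id = 0; id < n_vocab; ++id) {
cache[id] = llama_token_to_piece(model, id, /* special */ true);
}
return cache;
}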
  2045. static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
  2046. ggml_backend_buffer_type_t buft = nullptr;
  2047. #if defined(GGML_USE_CUDA)
  2048. // host buffers should only be used when data is expected to be copied to/from the GPU
  2049. if (host_buffer) {
  2050. buft = ggml_backend_cuda_host_buffer_type();
  2051. }
  2052. #elif defined(GGML_USE_SYCL)
  2053. if (host_buffer) {
  2054. buft = ggml_backend_sycl_host_buffer_type();
  2055. }
  2056. #elif defined(GGML_USE_CPU_HBM)
  2057. buft = ggml_backend_cpu_hbm_buffer_type();
  2058. #elif defined(GGML_USE_VULKAN)
  2059. if (host_buffer) {
  2060. buft = ggml_backend_vk_host_buffer_type();
  2061. }
  2062. #endif
  2063. if (buft == nullptr) {
  2064. buft = ggml_backend_cpu_buffer_type();
  2065. }
  2066. return buft;
  2067. GGML_UNUSED(host_buffer);
  2068. }
  2069. //
  2070. // globals
  2071. //
  2072. struct llama_state {
  2073. llama_state() {
  2074. #ifdef GGML_USE_METAL
  2075. ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
  2076. #elif defined(GGML_USE_CUDA)
  2077. ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
  2078. #elif defined(GGML_USE_CANN)
  2079. ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
  2080. #endif
  2081. }
  2082. // We save the log callback globally
  2083. ggml_log_callback log_callback = llama_log_callback_default;
  2084. void * log_callback_user_data = nullptr;
  2085. };
  2086. static llama_state g_state;
  2087. // available llama models
  2088. enum e_model {
  2089. MODEL_UNKNOWN,
  2090. MODEL_14M,
  2091. MODEL_17M,
  2092. MODEL_22M,
  2093. MODEL_33M,
  2094. MODEL_60M,
  2095. MODEL_70M,
  2096. MODEL_80M,
  2097. MODEL_109M,
  2098. MODEL_137M,
  2099. MODEL_160M,
  2100. MODEL_220M,
  2101. MODEL_250M,
  2102. MODEL_270M,
  2103. MODEL_335M,
  2104. MODEL_410M,
  2105. MODEL_450M,
  2106. MODEL_770M,
  2107. MODEL_780M,
  2108. MODEL_0_5B,
  2109. MODEL_1B,
  2110. MODEL_1_3B,
  2111. MODEL_1_4B,
  2112. MODEL_1_6B,
  2113. MODEL_2B,
  2114. MODEL_2_8B,
  2115. MODEL_3B,
  2116. MODEL_4B,
  2117. MODEL_6B,
  2118. MODEL_6_9B,
  2119. MODEL_7B,
  2120. MODEL_8B,
  2121. MODEL_9B,
  2122. MODEL_11B,
  2123. MODEL_12B,
  2124. MODEL_13B,
  2125. MODEL_14B,
  2126. MODEL_15B,
  2127. MODEL_16B,
  2128. MODEL_20B,
  2129. MODEL_22B,
  2130. MODEL_30B,
  2131. MODEL_34B,
  2132. MODEL_35B,
  2133. MODEL_40B,
  2134. MODEL_65B,
  2135. MODEL_70B,
  2136. MODEL_236B,
  2137. MODEL_314B,
  2138. MODEL_SMALL,
  2139. MODEL_MEDIUM,
  2140. MODEL_LARGE,
  2141. MODEL_XL,
  2142. MODEL_A2_7B,
  2143. MODEL_8x7B,
  2144. MODEL_8x22B,
  2145. MODEL_16x12B,
  2146. MODEL_10B_128x3_66B,
  2147. MODEL_57B_A14B,
  2148. MODEL_27B,
  2149. };
  2150. static const size_t kiB = 1024;
  2151. static const size_t MiB = 1024*kiB;
  2152. static const size_t GiB = 1024*MiB;
  2153. struct llama_hparams {
  2154. bool vocab_only;
  2155. bool rope_finetuned;
  2156. bool use_par_res;
  2157. uint32_t n_vocab;
  2158. uint32_t n_ctx_train; // context size the model was trained on
  2159. uint32_t n_embd;
  2160. uint32_t n_layer;
  2161. uint32_t n_rot;
  2162. uint32_t n_swa = 0; // sliding window attention (SWA)
  2163. uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
  2164. uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
  2165. uint32_t n_expert = 0;
  2166. uint32_t n_expert_used = 0;
  2167. uint32_t n_vocab_type = 0; // for BERT-style token types
  2168. uint32_t n_rel_attn_bkts = 0;
  2169. std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
  2170. std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
  2171. std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
  2172. std::array<std::array<uint32_t, LLAMA_MAX_LAYERS>, 4> n_bskcn_arr;
  2173. uint32_t n_layer_dense_lead = 0;
  2174. uint32_t n_lora_q = 0;
  2175. uint32_t n_lora_kv = 0;
  2176. uint32_t n_ff_exp = 0;
  2177. uint32_t n_ff_shexp = 0;
  2178. uint32_t n_expert_shared = 0;
  2179. float expert_weights_scale = 0.0;
  2180. float f_norm_eps;
  2181. float f_norm_rms_eps;
  2182. float f_attn_logit_softcapping = 50.0f;
  2183. float f_final_logit_softcapping = 30.0f;
  2184. // for RWKV
  2185. uint32_t rescale_every_n_layers = 0;
  2186. uint32_t time_mix_extra_dim = 0;
  2187. uint32_t time_decay_extra_dim = 0;
  2188. uint32_t wkv_head_size = 0;
  2189. float rope_attn_factor = 1.0f;
  2190. float rope_freq_base_train;
  2191. float rope_freq_scale_train;
  2192. uint32_t n_ctx_orig_yarn;
  2193. float rope_yarn_log_mul;
  2194. // for State Space Models
  2195. uint32_t ssm_d_conv = 0;
  2196. uint32_t ssm_d_inner = 0;
  2197. uint32_t ssm_d_state = 0;
  2198. uint32_t ssm_dt_rank = 0;
  2199. bool ssm_dt_b_c_rms = false;
  2200. float f_clamp_kqv = 0.0f;
  2201. float f_max_alibi_bias = 0.0f;
  2202. float f_logit_scale = 0.0f;
  2203. bool causal_attn = true;
  2204. bool use_alibi = false;
  2205. bool attn_soft_cap = false;
  2206. // needed by encoder-decoder models (e.g. T5, FLAN-T5)
  2207. // ref: https://github.com/ggerganov/llama.cpp/pull/8141
  2208. llama_token dec_start_token_id = -1;
  2209. enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
  2210. enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
  2211. enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
  2212. bool operator!=(const llama_hparams & other) const {
  2213. if (this->vocab_only != other.vocab_only) return true;
  2214. if (this->n_vocab != other.n_vocab) return true;
  2215. if (this->n_ctx_train != other.n_ctx_train) return true;
  2216. if (this->n_embd != other.n_embd) return true;
  2217. if (this->n_layer != other.n_layer) return true;
  2218. if (this->n_rot != other.n_rot) return true;
  2219. if (this->n_swa != other.n_swa) return true;
  2220. if (this->n_embd_head_k != other.n_embd_head_k) return true;
  2221. if (this->n_embd_head_v != other.n_embd_head_v) return true;
  2222. if (this->n_expert != other.n_expert) return true;
  2223. if (this->n_expert_used != other.n_expert_used) return true;
  2224. if (this->n_head_arr != other.n_head_arr) return true;
  2225. if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
  2226. if (this->n_ff_arr != other.n_ff_arr) return true;
  2227. if (this->n_bskcn_arr != other.n_bskcn_arr) return true;
  2228. if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
  2229. if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
  2230. if (this->n_lora_q != other.n_lora_q) return true;
  2231. if (this->n_lora_kv != other.n_lora_kv) return true;
  2232. if (this->n_ff_exp != other.n_ff_exp) return true;
  2233. if (this->n_ff_shexp != other.n_ff_shexp) return true;
  2234. if (this->n_expert_shared != other.n_expert_shared) return true;
  2235. if (this->rope_finetuned != other.rope_finetuned) return true;
  2236. if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
  2237. if (this->ssm_d_conv != other.ssm_d_conv) return true;
  2238. if (this->ssm_d_inner != other.ssm_d_inner) return true;
  2239. if (this->ssm_d_state != other.ssm_d_state) return true;
  2240. if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
  2241. if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
  2242. if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
  2243. if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
  2244. if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
  2245. if (this->wkv_head_size != other.wkv_head_size) return true;
  2246. if (this->dec_start_token_id != other.dec_start_token_id) return true;
  2247. const float EPSILON = 1e-9f;
  2248. if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
  2249. if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
  2250. if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
  2251. if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
  2252. if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
  2253. if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
  2254. if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
  2255. return false;
  2256. }
  2257. uint32_t n_head(uint32_t il = 0) const {
  2258. if (il < n_layer) {
  2259. return n_head_arr[il];
  2260. }
  2261. GGML_ABORT("fatal error");
  2262. }
  2263. uint32_t n_head_kv(uint32_t il = 0) const {
  2264. if (il < n_layer) {
  2265. return n_head_kv_arr[il];
  2266. }
  2267. GGML_ABORT("fatal error");
  2268. }
  2269. uint32_t n_ff(uint32_t il = 0) const {
  2270. if (il < n_layer) {
  2271. return n_ff_arr[il];
  2272. }
  2273. GGML_ABORT("fatal error");
  2274. }
  2275. uint32_t n_gqa(uint32_t il = 0) const {
  2276. const uint32_t n_head = this->n_head(il);
  2277. const uint32_t n_head_kv = this->n_head_kv(il);
  2278. if (n_head_kv == 0) {
  2279. return 0;
  2280. }
  2281. return n_head/n_head_kv;
  2282. }
  2283. uint32_t n_embd_k_gqa(uint32_t il = 0) const { // dimension of key embeddings across all k-v heads
  2284. const uint32_t n_head_kv = this->n_head_kv(il);
  2285. return n_embd_head_k * n_head_kv;
  2286. }
  2287. uint32_t n_embd_v_gqa(uint32_t il = 0) const { // dimension of value embeddings across all k-v heads
  2288. const uint32_t n_head_kv = this->n_head_kv(il);
  2289. return n_embd_head_v * n_head_kv;
  2290. }
  2291. uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings
  2292. // corresponds to Mamba's conv_states size or RWKV's token_shift states size
  2293. if (wkv_head_size != 0) {
  2294. // for RWKV models
  2295. return 2 * n_embd;
  2296. } else {
  2297. // TODO: maybe support other convolution strides than 1
  2298. // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
  2299. return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
  2300. }
  2301. }
  2302. uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
  2303. if (wkv_head_size != 0) {
  2304. // corresponds to RWKV's wkv_states size
  2305. return n_embd * wkv_head_size;
  2306. } else {
  2307. // corresponds to Mamba's ssm_states size
  2308. return ssm_d_state * ssm_d_inner;
  2309. }
  2310. }
  2311. bool n_bskcn(uint32_t n, uint32_t il = 0) const {
  2312. if (il < n_layer) {
  2313. return n_bskcn_arr[n][il] > 0;
  2314. }
  2315. GGML_ABORT("fatal error");
  2316. }
  2317. };
  2318. static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
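// Worked example (illustrative sketch with made-up numbers): for a model with
// n_embd_head_k = n_embd_head_v = 128, 32 query heads and 8 KV heads per layer,
// the helpers above give n_gqa() = 32/8 = 4 and n_embd_k_gqa() = 128*8 = 1024,
// i.e. 1024 K values (and 1024 V values) are cached per token per layer.
static void llama_hparams_gqa_example() {
llama_hparams h = {};
h.n_layer       = 2;
h.n_embd_head_k = 128;
h.n_embd_head_v = 128;
h.n_head_arr.fill(32);
h.n_head_kv_arr.fill(8);
GGML_ASSERT(h.n_gqa()        == 4);
GGML_ASSERT(h.n_embd_k_gqa() == 1024);
GGML_ASSERT(h.n_embd_v_gqa() == 1024);
}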
  2319. struct llama_cparams {
  2320. uint32_t n_ctx; // context size used during inference
  2321. uint32_t n_batch;
  2322. uint32_t n_ubatch;
  2323. uint32_t n_seq_max;
  2324. int n_threads; // number of threads to use for generation
  2325. int n_threads_batch; // number of threads to use for batch processing
  2326. float rope_freq_base;
  2327. float rope_freq_scale;
  2328. uint32_t n_ctx_orig_yarn;
  2329. // These hyperparameters are not exposed in GGUF, because all
  2330. // existing YaRN models use the same values for them.
  2331. float yarn_ext_factor;
  2332. float yarn_attn_factor;
  2333. float yarn_beta_fast;
  2334. float yarn_beta_slow;
  2335. float defrag_thold;
  2336. bool embeddings;
  2337. bool causal_attn;
  2338. bool offload_kqv;
  2339. bool flash_attn;
  2340. enum llama_pooling_type pooling_type;
  2341. ggml_backend_sched_eval_callback cb_eval;
  2342. void * cb_eval_user_data;
  2343. };
  2344. // TODO: separate into "llama_layer_enc" and "llama_layer_dec"
  2345. struct llama_layer {
  2346. // normalization
  2347. struct ggml_tensor * attn_norm;
  2348. struct ggml_tensor * attn_norm_b;
  2349. struct ggml_tensor * attn_norm_2;
  2350. struct ggml_tensor * attn_norm_2_b;
  2351. struct ggml_tensor * attn_q_norm;
  2352. struct ggml_tensor * attn_q_norm_b;
  2353. struct ggml_tensor * attn_k_norm;
  2354. struct ggml_tensor * attn_k_norm_b;
  2355. struct ggml_tensor * attn_out_norm;
  2356. struct ggml_tensor * attn_out_norm_b;
  2357. struct ggml_tensor * attn_q_a_norm;
  2358. struct ggml_tensor * attn_kv_a_norm;
  2359. struct ggml_tensor * attn_sub_norm;
  2360. struct ggml_tensor * attn_post_norm;
  2361. struct ggml_tensor * ffn_sub_norm;
  2362. struct ggml_tensor * attn_norm_cross;
  2363. struct ggml_tensor * attn_norm_enc;
  2364. // attention
  2365. struct ggml_tensor * wq;
  2366. struct ggml_tensor * wk;
  2367. struct ggml_tensor * wv;
  2368. struct ggml_tensor * wo;
  2369. struct ggml_tensor * wqkv;
  2370. struct ggml_tensor * wq_a;
  2371. struct ggml_tensor * wq_b;
  2372. struct ggml_tensor * wkv_a_mqa;
  2373. struct ggml_tensor * wkv_b;
  2374. struct ggml_tensor * wq_cross;
  2375. struct ggml_tensor * wk_cross;
  2376. struct ggml_tensor * wv_cross;
  2377. struct ggml_tensor * wo_cross;
  2378. struct ggml_tensor * wq_enc;
  2379. struct ggml_tensor * wk_enc;
  2380. struct ggml_tensor * wv_enc;
  2381. struct ggml_tensor * wo_enc;
  2382. // attention bias
  2383. struct ggml_tensor * bq;
  2384. struct ggml_tensor * bk;
  2385. struct ggml_tensor * bv;
  2386. struct ggml_tensor * bo;
  2387. struct ggml_tensor * bqkv;
  2388. // relative position bias
  2389. struct ggml_tensor * attn_rel_b;
  2390. struct ggml_tensor * attn_rel_b_enc;
  2391. struct ggml_tensor * attn_rel_b_cross;
  2392. // normalization
  2393. struct ggml_tensor * ffn_norm;
  2394. struct ggml_tensor * ffn_norm_b;
  2395. struct ggml_tensor * ffn_post_norm;
  2396. struct ggml_tensor * layer_out_norm;
  2397. struct ggml_tensor * layer_out_norm_b;
  2398. struct ggml_tensor * ffn_norm_exps;
  2399. struct ggml_tensor * ffn_norm_enc;
  2400. // ff
  2401. struct ggml_tensor * ffn_gate; // w1
  2402. struct ggml_tensor * ffn_down; // w2
  2403. struct ggml_tensor * ffn_up; // w3
  2404. struct ggml_tensor * ffn_gate_enc;
  2405. struct ggml_tensor * ffn_down_enc;
  2406. struct ggml_tensor * ffn_up_enc;
  2407. // ff MoE
  2408. struct ggml_tensor * ffn_gate_inp;
  2409. struct ggml_tensor * ffn_gate_exps;
  2410. struct ggml_tensor * ffn_down_exps;
2411. struct ggml_tensor * ffn_up_exps;
  2412. // ff shared expert (shexp)
  2413. struct ggml_tensor * ffn_gate_inp_shexp;
  2414. struct ggml_tensor * ffn_gate_shexp;
  2415. struct ggml_tensor * ffn_down_shexp;
  2416. struct ggml_tensor * ffn_up_shexp;
  2417. // ff bias
  2418. struct ggml_tensor * ffn_gate_b = nullptr;
  2419. struct ggml_tensor * ffn_down_b = nullptr; // b2
  2420. struct ggml_tensor * ffn_up_b = nullptr; // b3
  2421. struct ggml_tensor * ffn_act;
  2422. // mamba proj
  2423. struct ggml_tensor * ssm_in;
  2424. struct ggml_tensor * ssm_x;
  2425. struct ggml_tensor * ssm_dt;
  2426. struct ggml_tensor * ssm_out;
  2427. // mamba
  2428. struct ggml_tensor * ssm_conv1d;
  2429. struct ggml_tensor * ssm_a;
  2430. struct ggml_tensor * ssm_d;
  2431. // mamba bias
  2432. struct ggml_tensor * ssm_conv1d_b;
  2433. struct ggml_tensor * ssm_dt_b;
  2434. // rwkv
  2435. struct ggml_tensor * time_mix_w1;
  2436. struct ggml_tensor * time_mix_w2;
  2437. struct ggml_tensor * time_mix_lerp_x;
  2438. struct ggml_tensor * time_mix_lerp_w;
  2439. struct ggml_tensor * time_mix_lerp_k;
  2440. struct ggml_tensor * time_mix_lerp_v;
  2441. struct ggml_tensor * time_mix_lerp_r;
  2442. struct ggml_tensor * time_mix_lerp_g;
  2443. struct ggml_tensor * time_mix_first;
  2444. struct ggml_tensor * time_mix_decay;
  2445. struct ggml_tensor * time_mix_decay_w1;
  2446. struct ggml_tensor * time_mix_decay_w2;
  2447. struct ggml_tensor * time_mix_key;
  2448. struct ggml_tensor * time_mix_value;
  2449. struct ggml_tensor * time_mix_receptance;
  2450. struct ggml_tensor * time_mix_gate;
  2451. struct ggml_tensor * time_mix_ln;
  2452. struct ggml_tensor * time_mix_ln_b;
  2453. struct ggml_tensor * time_mix_output;
  2454. struct ggml_tensor * channel_mix_lerp_k;
  2455. struct ggml_tensor * channel_mix_lerp_r;
  2456. struct ggml_tensor * channel_mix_key;
  2457. struct ggml_tensor * channel_mix_receptance;
  2458. struct ggml_tensor * channel_mix_value;
  2459. // long rope factors
  2460. struct ggml_tensor * rope_long = nullptr;
  2461. struct ggml_tensor * rope_short = nullptr;
  2462. struct ggml_tensor * rope_freqs = nullptr;
  2463. // bitnet scale
  2464. struct ggml_tensor * wq_scale;
  2465. struct ggml_tensor * wk_scale;
  2466. struct ggml_tensor * wv_scale;
  2467. struct ggml_tensor * wo_scale;
  2468. struct ggml_tensor * ffn_gate_scale;
  2469. struct ggml_tensor * ffn_up_scale;
  2470. struct ggml_tensor * ffn_down_scale;
  2471. struct ggml_tensor * bskcn_tv;
  2472. };
  2473. // very similar to llama_batch,
  2474. // but has more metadata about sequences
  2475. struct llama_ubatch {
  2476. bool equal_seqs;
  2477. // TODO: whole_seqs for embeddings?
  2478. uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
  2479. uint32_t n_seq_tokens; // tokens per sequence
  2480. uint32_t n_seqs;
  2481. llama_token * token; // [n_tokens]
  2482. float * embd; // [n_embd, n_tokens]
  2483. llama_pos * pos; // [n_tokens]
  2484. int32_t * n_seq_id; // [n_seqs]
  2485. llama_seq_id ** seq_id; // [n_seqs]
  2486. int8_t * output; // [n_tokens]
  2487. };
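// Invariant sketch (illustrative only): the splitting code below always keeps
// n_tokens == n_seq_tokens * n_seqs; equal_seqs ubatches lay tokens out sequence by sequence,
// while simple splits use one "virtual" sequence per token (n_seq_tokens == 1).
static bool llama_ubatch_is_consistent_example(const llama_ubatch & ub) {
return ub.n_tokens == ub.n_seq_tokens * ub.n_seqs;
}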
  2488. struct llama_kv_cell {
  2489. llama_pos pos = -1;
  2490. llama_pos delta = 0;
  2491. int32_t src = -1; // used by recurrent state models to copy states
  2492. int32_t tail = -1;
  2493. std::set<llama_seq_id> seq_id;
  2494. bool has_seq_id(const llama_seq_id & id) const {
  2495. return seq_id.find(id) != seq_id.end();
  2496. }
  2497. bool is_empty() const {
  2498. return seq_id.empty();
  2499. }
  2500. bool is_same_seq(const llama_kv_cell & other) const {
  2501. return seq_id == other.seq_id;
  2502. }
  2503. };
  2504. // ring-buffer of cached KV data
  2505. struct llama_kv_cache {
  2506. bool has_shift = false;
  2507. bool do_defrag = false;
  2508. bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
  2509. bool v_trans = true; // the value tensor is transposed
  2510. // Note: The value of head isn't only used to optimize searching
  2511. // for a free KV slot. llama_decode_internal also uses it, so it
  2512. // cannot be freely changed after a slot has been allocated.
  2513. uint32_t head = 0;
  2514. uint32_t size = 0;
  2515. uint32_t used = 0; // used cells (i.e. at least one seq_id)
  2516. // computed before each graph build
  2517. uint32_t n = 0;
  2518. ggml_type type_k = GGML_TYPE_F16;
  2519. ggml_type type_v = GGML_TYPE_F16;
  2520. std::vector<llama_kv_cell> cells;
  2521. std::vector<struct ggml_tensor *> k_l; // per layer
  2522. std::vector<struct ggml_tensor *> v_l;
  2523. std::vector<struct ggml_context *> ctxs;
  2524. std::vector<ggml_backend_buffer_t> bufs;
  2525. size_t total_size() const {
  2526. size_t size = 0;
  2527. for (ggml_backend_buffer_t buf : bufs) {
  2528. size += ggml_backend_buffer_get_size(buf);
  2529. }
  2530. return size;
  2531. }
  2532. ~llama_kv_cache() {
  2533. for (struct ggml_context * ctx : ctxs) {
  2534. ggml_free(ctx);
  2535. }
  2536. for (ggml_backend_buffer_t buf : bufs) {
  2537. ggml_backend_buffer_free(buf);
  2538. }
  2539. }
  2540. };
  2541. struct llama_control_vector {
  2542. std::vector<struct ggml_tensor *> tensors; // per layer
  2543. std::vector<struct ggml_context *> ctxs;
  2544. std::vector<ggml_backend_buffer_t> bufs;
  2545. int32_t layer_start = -1;
  2546. int32_t layer_end = -1;
  2547. struct ggml_tensor * tensor_for(int il) const {
  2548. if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
  2549. return nullptr;
  2550. }
  2551. return tensors[il];
  2552. }
  2553. struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
  2554. ggml_tensor * layer_dir = tensor_for(il);
  2555. if (layer_dir != nullptr) {
  2556. cur = ggml_add(ctx, cur, layer_dir);
  2557. }
  2558. return cur;
  2559. }
  2560. ~llama_control_vector() {
  2561. for (struct ggml_context * ctx : ctxs) {
  2562. ggml_free(ctx);
  2563. }
  2564. for (ggml_backend_buffer_t buf : bufs) {
  2565. ggml_backend_buffer_free(buf);
  2566. }
  2567. }
  2568. };
  2569. struct llama_model {
  2570. e_model type = MODEL_UNKNOWN;
  2571. llm_arch arch = LLM_ARCH_UNKNOWN;
  2572. llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
  2573. std::string name = "n/a";
  2574. llama_hparams hparams = {};
  2575. llama_vocab vocab;
  2576. struct ggml_tensor * tok_embd;
  2577. struct ggml_tensor * type_embd;
  2578. struct ggml_tensor * pos_embd;
  2579. struct ggml_tensor * tok_norm;
  2580. struct ggml_tensor * tok_norm_b;
  2581. struct ggml_tensor * output_norm;
  2582. struct ggml_tensor * output_norm_b;
  2583. struct ggml_tensor * output;
  2584. struct ggml_tensor * output_b;
  2585. struct ggml_tensor * output_norm_enc;
  2586. std::vector<llama_layer> layers;
  2587. llama_split_mode split_mode;
  2588. int main_gpu;
  2589. int n_gpu_layers;
  2590. std::vector<std::string> rpc_servers;
  2591. // gguf metadata
  2592. std::unordered_map<std::string, std::string> gguf_kv;
  2593. // layer -> buffer type mapping
  2594. struct layer_buft {
  2595. layer_buft() : buft_matrix(nullptr), buft(nullptr) {}
  2596. layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {}
  2597. layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {}
  2598. ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication
  2599. ggml_backend_buffer_type_t buft; // everything else
  2600. };
  2601. layer_buft buft_input;
  2602. layer_buft buft_output;
  2603. std::vector<layer_buft> buft_layer;
  2604. // contexts where the model tensors metadata is stored
  2605. std::vector<struct ggml_context *> ctxs;
  2606. // the model memory buffers for the tensor data
  2607. std::vector<ggml_backend_buffer_t> bufs;
  2608. // model memory mapped files
  2609. llama_mmaps mappings;
  2610. // objects representing data potentially being locked in memory
  2611. llama_mlocks mlock_bufs;
  2612. llama_mlocks mlock_mmaps;
  2613. // for quantize-stats only
  2614. std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
  2615. int64_t t_load_us = 0;
  2616. int64_t t_start_us = 0;
  2617. // keep track of loaded lora adapters
  2618. std::set<struct llama_lora_adapter *> lora_adapters;
  2619. ~llama_model() {
  2620. for (struct ggml_context * ctx : ctxs) {
  2621. ggml_free(ctx);
  2622. }
  2623. for (ggml_backend_buffer_t buf : bufs) {
  2624. #ifdef GGML_USE_CUDA
  2625. if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
  2626. ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
  2627. }
  2628. #endif
  2629. ggml_backend_buffer_free(buf);
  2630. }
  2631. while (!lora_adapters.empty()) {
  2632. llama_lora_adapter_free(*lora_adapters.begin());
  2633. }
  2634. }
  2635. };
  2636. struct llama_sbatch_seq {
  2637. int32_t n_seq_id;
  2638. llama_seq_id * seq_id;
  2639. size_t offset;
  2640. size_t length;
  2641. // helper for smoother batch API transition -- can be deprecated in the future
  2642. llama_seq_id all_seq_id; // used if seq_id == NULL
  2643. };
  2644. // sequence-length-aware batch splitting
  2645. struct llama_sbatch {
  2646. // tokens left in this batch
  2647. size_t n_tokens;
  2648. size_t n_embd;
  2649. bool logits_all; // TODO: remove once lctx.logits_all is removed too
  2650. // sorted indices into the batch
  2651. std::vector<size_t> ids;
  2652. // batch indices of the output
  2653. std::vector<size_t> out_ids;
  2654. std::vector<llama_sbatch_seq> seq;
  2655. const llama_batch * batch = nullptr;
  2656. // buffers for the ubatch
  2657. std::vector<llama_token> ubatch_token;
  2658. std::vector<float> ubatch_embd;
  2659. std::vector<llama_pos> ubatch_pos;
  2660. std::vector<int32_t> ubatch_n_seq_id;
  2661. std::vector<llama_seq_id *> ubatch_seq_id;
  2662. std::vector<int8_t> ubatch_output;
  2663. llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false) {
  2664. // clear empty sequences
  2665. // the previous ubatch is assumed to be gone,
  2666. // so nothing should refer to values in these sequences anymore.
  2667. for (size_t i = seq.size(); i-- > 0;) {
  2668. if (seq[i].length == 0) {
  2669. seq.pop_back();
  2670. } else {
  2671. break;
  2672. }
  2673. }
  2674. ubatch_token.resize(!has_embd ? n_ubatch : 0);
  2675. ubatch_embd.resize(has_embd ? n_embd * n_ubatch : 0);
  2676. ubatch_pos.resize(n_ubatch);
  2677. ubatch_n_seq_id.resize(n_ubatch);
  2678. ubatch_seq_id.resize(n_ubatch);
  2679. ubatch_output.resize(n_ubatch);
  2680. llama_ubatch ubatch = {
  2681. /*equal_seqs =*/ true,
  2682. /*n_tokens =*/ 0,
  2683. /*n_seq_tokens =*/ 0,
  2684. /*n_seqs =*/ 0,
  2685. /*token =*/ !has_embd ? ubatch_token.data() : nullptr,
  2686. /*embd =*/ has_embd ? ubatch_embd.data() : nullptr,
  2687. /*pos =*/ ubatch_pos.data(),
  2688. /*n_seq_id =*/ ubatch_n_seq_id.data(),
  2689. /*seq_id =*/ ubatch_seq_id.data(),
  2690. /*output =*/ ubatch_output.data(),
  2691. };
  2692. return ubatch;
  2693. }
  2694. void add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) {
  2695. GGML_ASSERT(batch != nullptr);
  2696. GGML_ASSERT(length <= seq.length);
  2697. // Can only add sequences of equal lengths to a batch,
  2698. // otherwise it isn't clear to which sequence a token belongs
  2699. GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs);
  2700. GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs);
  2701. // NOTE: loops are separated for cache-friendliness
  2702. if (batch->token) {
  2703. if (ubatch.equal_seqs) {
  2704. for (size_t i = 0; i < length; ++i) {
  2705. ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]];
  2706. }
  2707. } else {
  2708. // simple split
  2709. ubatch.token = batch->token + seq.offset;
  2710. }
  2711. } else {
  2712. ubatch.token = nullptr;
  2713. }
  2714. if (batch->embd) {
  2715. if (ubatch.equal_seqs) {
  2716. for (size_t i = 0; i < length; ++i) {
  2717. memcpy(
  2718. ubatch.embd + n_embd * (ubatch.n_tokens + i),
  2719. batch->embd + n_embd * ids[seq.offset + i],
  2720. n_embd * sizeof(float)
  2721. );
  2722. }
  2723. } else {
  2724. // simple split
  2725. ubatch.embd = batch->embd + (n_embd * seq.offset);
  2726. }
  2727. } else {
  2728. ubatch.embd = nullptr;
  2729. }
  2730. // from here on, the else branches are deprecated;
  2731. // they are helpers for smoother batch API transition
  2732. if (batch->pos) {
  2733. if (ubatch.equal_seqs) {
  2734. for (size_t i = 0; i < length; ++i) {
  2735. ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]];
  2736. }
  2737. } else {
  2738. // simple split
  2739. ubatch.pos = batch->pos + seq.offset;
  2740. }
  2741. } else {
  2742. for (size_t i = 0; i < length; ++i) {
  2743. llama_pos bi = ids[seq.offset + i];
  2744. ubatch.pos[ubatch.n_tokens + i] = batch->all_pos_0 + (bi * batch->all_pos_1);
  2745. }
  2746. }
  2747. if (ubatch.equal_seqs) {
  2748. ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id;
  2749. if (seq.seq_id) {
  2750. ubatch.seq_id[ubatch.n_seqs] = seq.seq_id;
  2751. } else {
  2752. GGML_ASSERT(seq.n_seq_id == 1);
  2753. ubatch.seq_id[ubatch.n_seqs] = &seq.all_seq_id;
  2754. }
  2755. } else {
  2756. // simple split
  2757. if (batch->n_seq_id) {
2758. // a single pointer assignment is enough here; no per-token copy is needed
2759. ubatch.n_seq_id = batch->n_seq_id + seq.offset;
  2761. } else {
  2762. for (size_t i = 0; i < length; ++i) {
  2763. ubatch.n_seq_id[ubatch.n_seqs + i] = 1;
  2764. }
  2765. }
  2766. if (batch->seq_id) {
2767. // likewise, a single pointer assignment
2768. ubatch.seq_id = batch->seq_id + seq.offset;
  2770. } else {
  2771. for (size_t i = 0; i < length; ++i) {
  2772. ubatch.seq_id[ubatch.n_seqs + i] = &seq.all_seq_id;
  2773. }
  2774. }
  2775. }
  2776. if (logits_all) {
  2777. for (size_t i = 0; i < length; ++i) {
  2778. ubatch.output[ubatch.n_tokens + i] = 1;
  2779. out_ids.push_back(ids[seq.offset + i]);
  2780. }
  2781. } else if (batch->logits) {
  2782. if (ubatch.equal_seqs) {
  2783. for (size_t i = 0; i < length; ++i) {
  2784. size_t id = ids[seq.offset + i];
  2785. int8_t is_output = batch->logits[id];
  2786. ubatch.output[ubatch.n_tokens + i] = is_output;
  2787. if (is_output) { out_ids.push_back(id); }
  2788. }
  2789. } else {
  2790. // simple split
  2791. ubatch.output = batch->logits + seq.offset;
  2792. for (size_t i = 0; i < length; ++i) {
  2793. if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); }
  2794. }
  2795. }
  2796. } else {
  2797. // only get last output
  2798. for (size_t i = 0; i < length; ++i) {
  2799. size_t id = ids[seq.offset + i];
  2800. int8_t is_last = id == ids.size() - 1;
  2801. ubatch.output[ubatch.n_tokens + i] = is_last;
  2802. if (is_last) { out_ids.push_back(id); }
  2803. }
  2804. }
  2805. if (ubatch.n_tokens == 0 && ubatch.n_seqs == 0) {
  2806. ubatch.n_seq_tokens = ubatch.equal_seqs ? length : 1;
  2807. }
  2808. ubatch.n_tokens += length;
  2809. ubatch.n_seqs += ubatch.equal_seqs ? 1 : length; // virtual sequences for simple splits
  2810. seq.offset += length;
  2811. seq.length -= length;
  2812. n_tokens -= length;
  2813. GGML_ASSERT(ubatch.n_tokens == ubatch.n_seq_tokens * ubatch.n_seqs);
  2814. }
  2815. // simple split, unknown number of sequences of unequal lengths
  2816. llama_ubatch split_simple(size_t n_ubatch) {
  2817. n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
  2818. llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
  2819. ubatch.equal_seqs = false;
  2820. if (!seq.empty()) {
  2821. llama_sbatch_seq & s = seq[0];
  2822. size_t length = s.length < n_ubatch ? s.length : n_ubatch;
  2823. GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits
  2824. add_seq_to_ubatch(ubatch, s, length);
  2825. }
  2826. return ubatch;
  2827. }
  2828. // make batches of equal-length sequences
  2829. llama_ubatch split_equal(size_t n_ubatch) {
  2830. n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
  2831. llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
  2832. if (!seq.empty()) {
  2833. size_t length = 0;
  2834. size_t n_tokens_in_ubatch = 0;
  2835. GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits
  2836. // smallest first, because it's easier to split this way;
  2837. // starting from the end to pop in constant time.
  2838. for (size_t i = seq.size(); i-- > 0;) {
  2839. llama_sbatch_seq & s = seq[i];
  2840. GGML_ASSERT(s.length > 0);
  2841. if (length == 0) {
  2842. length = s.length < n_ubatch ? s.length : n_ubatch;
  2843. }
  2844. add_seq_to_ubatch(ubatch, s, length);
  2845. n_tokens_in_ubatch += length;
  2846. // shared prompts can't be mixed with any of their sequences,
  2847. // so it's safer to compute them in their own ubatch
  2848. if (s.n_seq_id > 1) { break; }
  2849. // stop when there isn't enough space for another sequence
  2850. if (length + n_tokens_in_ubatch > n_ubatch) { break; }
  2851. }
  2852. }
  2853. return ubatch;
  2854. }
  2855. // sequence-wise split
  2856. llama_ubatch split_seq(size_t n_ubatch) {
  2857. n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch;
  2858. llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr);
  2859. if (!seq.empty()) {
  2860. llama_sbatch_seq & s = seq[seq.size() - 1];
  2861. size_t length = s.length < n_ubatch ? s.length : n_ubatch;
  2862. GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits
  2863. add_seq_to_ubatch(ubatch, s, length);
  2864. }
  2865. return ubatch;
  2866. }
  2867. void from_batch(const llama_batch & batch, const size_t n_embd, const bool simple_split = false, const bool logits_all = false) {
  2868. GGML_ASSERT(batch.n_tokens >= 0);
  2869. this->batch = &batch;
  2870. this->n_embd = n_embd;
  2871. this->logits_all = logits_all;
  2872. n_tokens = batch.n_tokens;
  2873. ids.resize(n_tokens);
  2874. out_ids.clear();
  2875. // TODO: reserve out_ids and seq
  2876. for (size_t i = 0; i < n_tokens; ++i) {
  2877. ids[i] = i;
  2878. }
  2879. if (simple_split) {
  2880. seq.resize(1);
  2881. llama_sbatch_seq & s = seq[0];
  2882. s.n_seq_id = 0;
  2883. s.seq_id = nullptr;
  2884. s.offset = 0;
  2885. s.length = n_tokens;
  2886. s.all_seq_id = batch.all_seq_id;
  2887. return;
  2888. }
  2889. std::sort(ids.begin(), ids.end(),
  2890. [&batch](size_t a, size_t b) {
  2891. int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
  2892. int32_t n_seq_b = batch.n_seq_id ? batch.n_seq_id[b] : 1;
  2893. // sort by seq_id, then by pos
  2894. if (n_seq_a == n_seq_b) {
  2895. if (batch.seq_id) {
  2896. for (int32_t i = 0; i < n_seq_a; ++i) {
  2897. llama_seq_id seq_id_a = batch.seq_id[a][i];
  2898. llama_seq_id seq_id_b = batch.seq_id[b][i];
  2899. // smaller seq_ids go first
  2900. if (seq_id_a != seq_id_b) {
  2901. return seq_id_a < seq_id_b;
  2902. }
  2903. }
  2904. }
  2905. // when all else is equal, sort by pos
  2906. if (batch.pos) {
  2907. return batch.pos[a] < batch.pos[b];
  2908. }
  2909. // no pos, sort by id (assuming batch.all_pos_1 is positive)
  2910. return a < b;
  2911. }
  2912. // shared prompts go first
  2913. return n_seq_a > n_seq_b;
  2914. }
  2915. );
  2916. // init seq
  2917. llama_sbatch_seq * last_seq = nullptr;
  2918. if (batch.n_seq_id != nullptr && batch.seq_id != nullptr) {
  2919. for (size_t i = 0; i < n_tokens; ++i) {
  2920. const size_t bi = ids[i];
  2921. const int32_t n_seqs = batch.n_seq_id[bi];
  2922. llama_seq_id * seq_ids = batch.seq_id[bi];
  2923. if (last_seq != nullptr) {
  2924. bool same = n_seqs == last_seq->n_seq_id;
  2925. for (int32_t j = 0; same && j < n_seqs; ++j) {
  2926. if (seq_ids[j] != last_seq->seq_id[j]) {
  2927. same = false;
  2928. }
  2929. }
  2930. if (same) {
  2931. last_seq->length += 1;
  2932. continue;
  2933. }
  2934. }
  2935. llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1, batch.all_seq_id};
  2936. seq.push_back(new_seq);
  2937. last_seq = &seq.back();
  2938. }
  2939. } else {
  2940. llama_sbatch_seq new_seq = {1, nullptr, 0, n_tokens, batch.all_seq_id};
  2941. seq.push_back(new_seq);
  2942. }
2943. // sort by n_seq_id ascending so that shared prompts end up last (split_equal iterates from the end, so they are processed first); ties are sorted by length, descending
  2944. std::sort(seq.begin(), seq.end(),
  2945. [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
  2946. if (a.n_seq_id == b.n_seq_id) {
  2947. return a.length > b.length;
  2948. }
  2949. return a.n_seq_id < b.n_seq_id;
  2950. }
  2951. );
  2952. }
  2953. };
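// Usage sketch (illustrative only; the real driver is llama_decode_internal further down):
// an sbatch is initialized once per llama_batch and then repeatedly split into ubatches
// until every token has been consumed.
static void llama_sbatch_usage_example(const llama_batch & batch, size_t n_embd, size_t n_ubatch) {
llama_sbatch sbatch;
sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ false);
while (sbatch.n_tokens > 0) {
llama_ubatch ubatch = sbatch.split_simple(n_ubatch);
// ... build and compute the graph for `ubatch` here ...
GGML_UNUSED(ubatch);
}
}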
  2954. struct llama_context {
  2955. llama_context(const llama_model & model)
  2956. : model(model)
  2957. , sampling(llama_n_vocab(&model))
  2958. , t_start_us(model.t_start_us)
  2959. , t_load_us(model.t_load_us) {}
  2960. ~llama_context() {
  2961. ggml_backend_sched_free(sched);
  2962. for (ggml_backend_t backend : backends) {
  2963. ggml_backend_free(backend);
  2964. }
  2965. ggml_backend_buffer_free(buf_output);
  2966. }
  2967. const struct llama_model & model;
  2968. struct llama_cparams cparams;
  2969. struct llama_sampling sampling;
  2970. struct llama_sbatch sbatch;
  2971. struct llama_kv_cache kv_self;
  2972. struct llama_control_vector cvec;
  2973. std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
  2974. std::vector<ggml_backend_t> backends;
  2975. #ifdef GGML_USE_METAL
  2976. ggml_backend_t backend_metal = nullptr;
  2977. #endif
  2978. #ifdef GGML_USE_BLAS
  2979. ggml_backend_t backend_blas = nullptr;
  2980. #endif
  2981. ggml_backend_t backend_cpu = nullptr;
  2982. ggml_threadpool_t threadpool = nullptr;
  2983. ggml_threadpool_t threadpool_batch = nullptr;
  2984. bool has_evaluated_once = false;
  2985. int64_t t_start_us;
  2986. int64_t t_load_us;
  2987. int64_t t_p_eval_us = 0;
  2988. int64_t t_eval_us = 0;
  2989. int64_t t_compute_start_us = 0;
  2990. int64_t n_queued_tokens = 0;
  2991. int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
  2992. int32_t n_eval = 0; // number of eval calls
  2993. // host buffer for the model output (logits and embeddings)
  2994. ggml_backend_buffer_t buf_output = nullptr;
  2995. // decode output (2-dimensional array: [n_outputs][n_vocab])
  2996. size_t logits_size = 0; // capacity (of floats) for logits
  2997. float * logits = nullptr;
  2998. std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
  2999. size_t output_size = 0; // capacity (of tokens positions) for the output buffers
  3000. int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
  3001. bool logits_all = false;
  3002. // embeddings output (2-dimensional array: [n_outputs][n_embd])
  3003. // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
  3004. size_t embd_size = 0; // capacity (of floats) for embeddings
  3005. float * embd = nullptr;
  3006. // sequence embeddings output (map of [n_embd] vectors)
  3007. // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
  3008. std::map<llama_seq_id, std::vector<float>> embd_seq;
  3009. // whether we are computing encoder output or decoder output
  3010. bool is_encoding = false;
  3011. // output of the encoder part of the encoder-decoder models
  3012. std::vector<float> embd_enc;
  3013. std::vector<std::set<llama_seq_id>> seq_ids_enc;
  3014. // memory buffers used to evaluate the model
  3015. std::vector<uint8_t> buf_compute_meta;
  3016. ggml_backend_sched_t sched = nullptr;
  3017. ggml_abort_callback abort_callback = nullptr;
  3018. void * abort_callback_data = nullptr;
  3019. // input tensors
  3020. struct ggml_tensor * inp_tokens; // I32 [n_batch]
  3021. struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
  3022. struct ggml_tensor * inp_pos; // I32 [n_batch]
  3023. struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
  3024. struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
  3025. struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
  3026. struct ggml_tensor * inp_K_shift; // I32 [kv_size]
  3027. struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
  3028. struct ggml_tensor * inp_cls; // I32 [n_batch]
  3029. struct ggml_tensor * inp_s_copy; // I32 [kv_size]
  3030. struct ggml_tensor * inp_s_mask; // F32 [1, n_kv]
  3031. struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
  3032. struct ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
  3033. struct ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
  3034. struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
  3035. };
  3036. struct llama_lora_weight {
  3037. struct ggml_tensor * a = nullptr;
  3038. struct ggml_tensor * b = nullptr;
  3039. llama_lora_weight() = default;
  3040. llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b): a(a), b(b) {}
  3041. };
  3042. struct llama_lora_adapter {
  3043. struct llama_model * base_model;
  3044. // map tensor name to lora_a_b
  3045. std::unordered_map<std::string, struct llama_lora_weight> ab_map;
  3046. std::vector<struct ggml_context *> ctxs;
  3047. std::vector<ggml_backend_buffer_t> bufs;
  3048. float alpha;
  3049. llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
  3050. base_model->lora_adapters.insert(this);
  3051. }
  3052. llama_lora_weight * get_weight(struct ggml_tensor * w) {
  3053. std::string name(w->name);
  3054. auto pos = ab_map.find(name);
3055. if (pos != ab_map.end()) {
  3056. return &pos->second;
  3057. }
  3058. return nullptr;
  3059. }
  3060. ~llama_lora_adapter() {
  3061. for (struct ggml_context * ctx : ctxs) {
  3062. ggml_free(ctx);
  3063. }
  3064. for (ggml_backend_buffer_t buf : bufs) {
  3065. ggml_backend_buffer_free(buf);
  3066. }
  3067. auto pos = base_model->lora_adapters.find(this);
  3068. if (pos != base_model->lora_adapters.end()) {
  3069. base_model->lora_adapters.erase(pos);
  3070. }
  3071. }
  3072. };
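// Minimal sketch (illustrative only) of how a LoRA weight pair modifies a matrix product:
// the result is W*x plus the low-rank update B*(A*x), scaled by a factor derived from `alpha`
// and the adapter strength. The graph helper actually used by the build code further down in
// this file additionally iterates over all adapters active in the context.
static struct ggml_tensor * llama_lora_mm_sketch(
struct ggml_context * ctx,
struct llama_lora_weight & lw,
struct ggml_tensor * w,
struct ggml_tensor * x,
float scale) {
struct ggml_tensor * res = ggml_mul_mat(ctx, w, x);     // base product W*x
struct ggml_tensor * ax  = ggml_mul_mat(ctx, lw.a, x);  // down-projection A*x
struct ggml_tensor * bax = ggml_mul_mat(ctx, lw.b, ax); // up-projection  B*(A*x)
return ggml_add(ctx, res, ggml_scale(ctx, bax, scale)); // W*x + scale * B*(A*x)
}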
  3073. static size_t llama_get_device_count(const llama_model & model) {
  3074. size_t count = 1;
  3075. #if defined(GGML_USE_CUDA)
  3076. count = ggml_backend_cuda_get_device_count();
  3077. #elif defined(GGML_USE_SYCL)
  3078. count = ggml_backend_sycl_get_device_count();
  3079. #elif defined(GGML_USE_VULKAN)
  3080. count = ggml_backend_vk_get_device_count();
  3081. #elif defined(GGML_USE_CANN)
  3082. return ggml_backend_cann_get_device_count();
  3083. #endif
  3084. #if defined(GGML_USE_RPC)
  3085. count += model.rpc_servers.size();
  3086. #endif
  3087. return count;
  3088. GGML_UNUSED(model);
  3089. }
  3090. static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
  3091. ggml_backend_buffer_type_t buft = nullptr;
  3092. #if defined(GGML_USE_RPC)
  3093. int dev_count = (int)llama_get_device_count(model);
  3094. int rpc_count = (int)model.rpc_servers.size();
  3095. if (gpu >= dev_count - rpc_count) {
  3096. const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
  3097. return ggml_backend_rpc_buffer_type(endpoint);
  3098. }
  3099. #endif
  3100. #if defined(GGML_USE_METAL)
  3101. buft = ggml_backend_metal_buffer_type();
  3102. #elif defined(GGML_USE_CUDA)
  3103. buft = ggml_backend_cuda_buffer_type(gpu);
  3104. #elif defined(GGML_USE_VULKAN)
  3105. buft = ggml_backend_vk_buffer_type(gpu);
  3106. #elif defined(GGML_USE_SYCL)
  3107. buft = ggml_backend_sycl_buffer_type(gpu);
  3108. #elif defined(GGML_USE_KOMPUTE)
  3109. buft = ggml_backend_kompute_buffer_type(gpu);
  3110. if (buft == nullptr) {
  3111. LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
  3112. }
  3113. #elif defined(GGML_USE_CANN)
  3114. buft = ggml_backend_cann_buffer_type(gpu);
  3115. #endif
  3116. if (buft == nullptr) {
  3117. buft = llama_default_buffer_type_cpu(true);
  3118. }
  3119. return buft;
  3120. GGML_UNUSED(model);
  3121. GGML_UNUSED(gpu);
  3122. }
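// Worked example (illustrative, made-up numbers) for the RPC index mapping above: with 2 local
// GPUs and 1 RPC server, llama_get_device_count() reports 3 devices; gpu index 2 satisfies
// 2 >= 3 - 1 and maps to rpc_servers[2 - 3 + 1] = rpc_servers[0], while indices 0 and 1 fall
// through to the local backend buffer types.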
  3123. static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
  3124. ggml_backend_buffer_type_t buft = nullptr;
  3125. #ifdef GGML_USE_CUDA
  3126. if (ggml_backend_cuda_get_device_count() > 1) {
  3127. buft = ggml_backend_cuda_split_buffer_type(tensor_split);
  3128. }
  3129. #endif
  3130. #ifdef GGML_USE_SYCL
  3131. if (ggml_backend_sycl_get_device_count() > 1) {
  3132. buft = ggml_backend_sycl_split_buffer_type(tensor_split);
  3133. }
  3134. #endif
  3135. if (buft == nullptr) {
  3136. buft = llama_default_buffer_type_offload(model, fallback_gpu);
  3137. }
  3138. return buft;
  3139. GGML_UNUSED(tensor_split);
  3140. }
  3141. static size_t llama_get_device_memory(const llama_model & model, int device) {
  3142. #if defined(GGML_USE_RPC)
  3143. int dev_count = (int)llama_get_device_count(model);
  3144. int rpc_count = (int)model.rpc_servers.size();
  3145. if (device >= dev_count - rpc_count) {
  3146. size_t total;
  3147. size_t free;
  3148. const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
  3149. ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
  3150. return free;
  3151. }
  3152. #endif
  3153. #if defined(GGML_USE_CUDA)
  3154. size_t total;
  3155. size_t free;
  3156. ggml_backend_cuda_get_device_memory(device, &free, &total);
  3157. return free;
  3158. #elif defined(GGML_USE_SYCL)
  3159. size_t total;
  3160. size_t free;
  3161. ggml_backend_sycl_get_device_memory(device, &free, &total);
  3162. return free;
  3163. #elif defined(GGML_USE_VULKAN)
  3164. size_t total;
  3165. size_t free;
  3166. ggml_backend_vk_get_device_memory(device, &free, &total);
  3167. return free;
  3168. #elif defined(GGML_USE_CANN)
  3169. size_t total;
  3170. size_t free;
  3171. ggml_backend_cann_get_device_memory(device, &free, &total);
  3172. return free;
  3173. #else
  3174. return 1;
  3175. #endif
  3176. GGML_UNUSED(model);
  3177. GGML_UNUSED(device);
  3178. }
  3179. //
  3180. // kv cache helpers
  3181. //
  3182. static bool llama_kv_cache_init(
  3183. struct llama_kv_cache & cache,
  3184. const llama_context * ctx,
  3185. ggml_type type_k,
  3186. ggml_type type_v,
  3187. uint32_t kv_size,
  3188. bool offload) {
  3189. const llama_model & model = ctx->model;
  3190. const llama_cparams & cparams = ctx->cparams;
  3191. const struct llama_hparams & hparams = model.hparams;
  3192. const int64_t n_layer = hparams.n_layer;
  3193. cache.has_shift = false;
  3194. cache.recurrent = llama_model_is_recurrent(&model);
  3195. cache.v_trans = !cache.recurrent && !cparams.flash_attn;
  3196. cache.head = 0;
  3197. cache.size = kv_size;
  3198. cache.used = 0;
  3199. cache.type_k = type_k;
  3200. cache.type_v = type_v;
  3201. cache.cells.clear();
  3202. cache.cells.resize(kv_size);
  3203. // count used buffer types
  3204. std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
  3205. if (offload) {
  3206. for (int64_t i = 0; i < n_layer; ++i) {
  3207. buft_layer_count[model.buft_layer[i].buft]++;
  3208. }
  3209. } else {
  3210. buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
  3211. }
  3212. // create a context for each buffer type
  3213. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  3214. for (auto & it : buft_layer_count) {
  3215. int n_layers = it.second;
  3216. struct ggml_init_params params = {
  3217. /*.mem_size =*/ 2u*n_layers*ggml_tensor_overhead(),
  3218. /*.mem_buffer =*/ NULL,
  3219. /*.no_alloc =*/ true,
  3220. };
  3221. ggml_context * ctx = ggml_init(params);
  3222. if (!ctx) {
  3223. LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
  3224. return false;
  3225. }
  3226. ctx_map[it.first] = ctx;
  3227. cache.ctxs.push_back(ctx);
  3228. }
  3229. cache.k_l.reserve(n_layer);
  3230. cache.v_l.reserve(n_layer);
  3231. for (int i = 0; i < (int) n_layer; i++) {
  3232. const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
  3233. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
  3234. struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
  3235. ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
  3236. ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size);
  3237. ggml_format_name(k, "cache_k_l%d", i);
  3238. ggml_format_name(v, "cache_v_l%d", i);
  3239. cache.k_l.push_back(k);
  3240. cache.v_l.push_back(v);
  3241. }
  3242. // allocate tensors and initialize the buffers to avoid NaNs in the padding
  3243. for (auto it : ctx_map) {
  3244. ggml_backend_buffer_type_t buft = it.first;
  3245. ggml_context * ctx = it.second;
  3246. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  3247. if (!buf) {
  3248. LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
  3249. return false;
  3250. }
  3251. ggml_backend_buffer_clear(buf, 0);
  3252. LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
  3253. cache.bufs.push_back(buf);
  3254. }
  3255. return true;
  3256. }
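// Back-of-the-envelope sketch (illustrative only) of the total size allocated above: per layer the
// cache holds one K row of (n_embd_k_gqa + n_embd_k_s)*kv_size elements of type_k and one V row of
// (n_embd_v_gqa + n_embd_v_s)*kv_size elements of type_v.
static size_t llama_kv_cache_size_estimate_example(const llama_hparams & hparams, uint32_t kv_size, ggml_type type_k, ggml_type type_v) {
size_t total = 0;
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
total += ggml_row_size(type_k, (int64_t) (hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s()) * kv_size);
total += ggml_row_size(type_v, (int64_t) (hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s()) * kv_size);
}
return total;
}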
  3257. // find an empty slot of size "n_tokens" in the cache
  3258. // updates the cache head
  3259. // Note: On success, it's important that cache.head points
  3260. // to the first cell of the slot.
  3261. static bool llama_kv_cache_find_slot(
  3262. struct llama_kv_cache & cache,
  3263. const struct llama_ubatch & batch) {
  3264. const uint32_t n_tokens = batch.n_tokens;
  3265. const uint32_t n_seqs = batch.n_seqs;
  3266. const uint32_t n_seq_tokens = batch.n_seq_tokens;
  3267. if (cache.recurrent) {
  3268. // For recurrent state architectures (like Mamba or RWKV),
  3269. // each cache cell can store the state for a whole sequence.
2270. // A slot should always be contiguous.
  3271. // can only process batches with an equal number of new tokens in each sequence
  3272. GGML_ASSERT(batch.equal_seqs);
  3273. int32_t min = cache.size - 1;
  3274. int32_t max = 0;
  3275. // everything should fit if all seq_ids are smaller than the max
  3276. for (uint32_t s = 0; s < n_seqs; ++s) {
  3277. const uint32_t n_seq_id = batch.n_seq_id[s];
  3278. for (uint32_t j = 0; j < n_seq_id; ++j) {
  3279. const llama_seq_id seq_id = batch.seq_id[s][j];
  3280. if (seq_id < 0 || (uint32_t) seq_id >= cache.size) {
  3281. // too big seq_id
  3282. // TODO: would it be possible to resize the cache instead?
  3283. LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
  3284. return false;
  3285. }
  3286. if (j > 0) {
  3287. llama_kv_cell & seq = cache.cells[seq_id];
  3288. if (seq.tail >= 0) {
  3289. llama_kv_cell & cell = cache.cells[seq.tail];
  3290. // clear cells from seq_ids that become shared
  3291. // (should not normally happen, but let's handle it anyway)
  3292. cell.seq_id.erase(seq_id);
  3293. seq.tail = -1;
  3294. if (cell.seq_id.empty()) {
  3295. cell.pos = -1;
  3296. cell.src = -1;
  3297. cache.used -= 1;
  3298. }
  3299. }
  3300. }
  3301. }
  3302. }
  3303. #ifndef NDEBUG
  3304. {
  3305. std::vector<int32_t> tails_verif;
  3306. tails_verif.assign(cache.size, -1);
  3307. for (uint32_t i = 0; i < cache.size; ++i) {
  3308. llama_kv_cell & cell = cache.cells[i];
  3309. for (llama_seq_id seq_id : cell.seq_id) {
  3310. if (tails_verif[seq_id] != -1) {
  3311. LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tails_verif[seq_id]);
  3312. }
  3313. tails_verif[seq_id] = i;
  3314. }
  3315. }
  3316. for (uint32_t i = 0; i < cache.size; ++i) {
  3317. if (tails_verif[i] != cache.cells[i].tail) {
3318. LLAMA_LOG_ERROR("%s: wrong tail for seq_id %d (%d instead of %d)\n", __func__, i, cache.cells[i].tail, tails_verif[i]);
  3319. }
  3320. }
  3321. }
  3322. #endif
  3323. // find next empty cell
  3324. uint32_t next_empty_cell = cache.head;
  3325. for (uint32_t i = 0; i < cache.size; ++i) {
  3326. if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
  3327. llama_kv_cell & cell = cache.cells[next_empty_cell];
  3328. if (cell.is_empty()) { break; }
  3329. next_empty_cell += 1;
  3330. }
  3331. // find usable cell range
  3332. for (uint32_t s = 0; s < n_seqs; ++s) {
  3333. const llama_seq_id seq_id = batch.seq_id[s][0];
  3334. llama_kv_cell & seq_meta = cache.cells[seq_id];
  3335. bool has_cell = false;
  3336. if (seq_meta.tail >= 0) {
  3337. llama_kv_cell & cell = cache.cells[seq_meta.tail];
  3338. GGML_ASSERT(cell.has_seq_id(seq_id));
  3339. // does this seq_id "own" the cell?
  3340. if (cell.seq_id.size() == 1) { has_cell = true; }
  3341. }
  3342. if (!has_cell) {
  3343. llama_kv_cell & empty_cell = cache.cells[next_empty_cell];
  3344. GGML_ASSERT(empty_cell.is_empty());
  3345. // copy old tail into the empty cell
  3346. if (seq_meta.tail >= 0) {
  3347. llama_kv_cell & orig_cell = cache.cells[seq_meta.tail];
  3348. empty_cell.pos = orig_cell.pos;
  3349. empty_cell.src = orig_cell.src;
  3350. orig_cell.seq_id.erase(seq_id);
  3351. empty_cell.seq_id.insert(seq_id); // will be overwritten
  3352. }
  3353. seq_meta.tail = next_empty_cell;
  3354. // find next empty cell
  3355. if (s + 1 < n_seqs) {
  3356. next_empty_cell += 1;
  3357. for (uint32_t i = 0; i < cache.size; ++i) {
  3358. if (next_empty_cell >= cache.size) { next_empty_cell -= cache.size; }
  3359. llama_kv_cell & cell = cache.cells[next_empty_cell];
  3360. if (cell.is_empty()) { break; }
  3361. next_empty_cell += 1;
  3362. }
  3363. }
  3364. }
  3365. if (min > seq_meta.tail) { min = seq_meta.tail; }
  3366. if (max < seq_meta.tail) { max = seq_meta.tail; }
  3367. }
  3368. // gather and re-order
  3369. for (uint32_t s = 0; s < n_seqs; ++s) {
  3370. int32_t dst_id = s + min;
  3371. int32_t src_id = cache.cells[batch.seq_id[s][0]].tail;
  3372. if (dst_id != src_id) {
  3373. llama_kv_cell & dst_cell = cache.cells[dst_id];
  3374. llama_kv_cell & src_cell = cache.cells[src_id];
  3375. std::swap(dst_cell.pos, src_cell.pos);
  3376. std::swap(dst_cell.src, src_cell.src);
  3377. std::swap(dst_cell.seq_id, src_cell.seq_id);
  3378. // swap tails (assuming they NEVER overlap)
  3379. for (const llama_seq_id seq_id : src_cell.seq_id) {
  3380. cache.cells[seq_id].tail = src_id;
  3381. }
  3382. for (const llama_seq_id seq_id : dst_cell.seq_id) {
  3383. cache.cells[seq_id].tail = dst_id;
  3384. }
  3385. }
  3386. }
  3387. // update the pos of the used seqs
  3388. for (uint32_t s = 0; s < n_seqs; ++s) {
  3389. const llama_pos last_pos = batch.pos[n_seq_tokens * s + n_seq_tokens - 1];
  3390. int32_t cell_id = s + min;
  3391. llama_kv_cell & cell = cache.cells[cell_id];
  3392. if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
  3393. // What should happen when the pos backtracks or skips a value?
  3394. // Clearing the state mid-batch would require special-casing which isn't done.
  3395. LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d with %u new tokens\n",
  3396. __func__, last_pos, cell.pos, batch.seq_id[s][0], n_seq_tokens);
  3397. }
  3398. cell.pos = last_pos;
  3399. cell.seq_id.clear();
  3400. for (int32_t j = 0; j < batch.n_seq_id[s]; ++j) {
  3401. const llama_seq_id seq_id = batch.seq_id[s][j];
  3402. cell.seq_id.insert(seq_id);
  3403. cache.cells[seq_id].tail = cell_id;
  3404. }
  3405. }
  3406. // allow getting the range of used cells, from head to head + n
  3407. cache.head = min;
  3408. cache.n = max - min + 1;
  3409. // sanity check
  3410. return cache.n >= n_seqs;
  3411. }
  3412. // otherwise, one cell per token.
  3413. if (n_tokens > cache.size) {
  3414. LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
  3415. return false;
  3416. }
  3417. uint32_t n_tested = 0;
  3418. while (true) {
  3419. if (cache.head + n_tokens > cache.size) {
  3420. n_tested += cache.size - cache.head;
  3421. cache.head = 0;
  3422. continue;
  3423. }
  3424. bool found = true;
  3425. for (uint32_t i = 0; i < n_tokens; i++) {
  3426. if (cache.cells[cache.head + i].pos >= 0) {
  3427. found = false;
  3428. cache.head += i + 1;
  3429. n_tested += i + 1;
  3430. break;
  3431. }
  3432. }
  3433. if (found) {
  3434. break;
  3435. }
  3436. if (n_tested >= cache.size) {
  3437. //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
  3438. return false;
  3439. }
  3440. }
  3441. for (uint32_t s = 0; s < n_seqs; s++) {
  3442. for (uint32_t i = 0; i < n_seq_tokens; ++i) {
  3443. uint32_t k = s*n_seq_tokens + i;
  3444. cache.cells[cache.head + k].pos = batch.pos[k];
  3445. for (int32_t j = 0; j < batch.n_seq_id[s]; j++) {
  3446. cache.cells[cache.head + k].seq_id.insert(batch.seq_id[s][j]);
  3447. }
  3448. }
  3449. }
  3450. cache.used += n_tokens;
  3451. return true;
  3452. }
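// Usage sketch (illustrative, not a verbatim excerpt of the decode path): callers reserve a slot
// for a ubatch right before building the graph and rely on the contract documented above, i.e.
// cache.head points at the first cell of the slot on success:
//
//     if (!llama_kv_cache_find_slot(kv_self, ubatch)) {
//         return 1; // no free slot - the batch cannot be processed right now
//     }
//     // the new tokens now occupy cells [kv_self.head, kv_self.head + ubatch.n_tokens)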
3453. // find the index one past the last used cell (an upper bound on the cells currently in use)
  3454. static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
  3455. for (uint32_t i = cache.size; i > 0; --i) {
  3456. const llama_kv_cell & cell = cache.cells[i - 1];
  3457. if (cell.pos >= 0 && !cell.is_empty()) {
  3458. return i;
  3459. }
  3460. }
  3461. return 0;
  3462. }
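// Usage sketch (illustrative): the decode path combines this upper bound with the cache padding
// to limit how many cells the attention graph has to consider, roughly:
//
//     const uint32_t pad = llama_kv_cache_get_padding(cparams);
//     kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));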
  3463. static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
  3464. for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
  3465. cache.cells[i].pos = -1;
  3466. cache.cells[i].seq_id.clear();
  3467. cache.cells[i].src = -1;
  3468. cache.cells[i].tail = -1;
  3469. }
  3470. cache.head = 0;
  3471. cache.used = 0;
  3472. for (auto & buf : cache.bufs) {
  3473. ggml_backend_buffer_clear(buf, 0);
  3474. }
  3475. }
  3476. static bool llama_kv_cache_seq_rm(
  3477. struct llama_kv_cache & cache,
  3478. llama_seq_id seq_id,
  3479. llama_pos p0,
  3480. llama_pos p1) {
  3481. uint32_t new_head = cache.size;
  3482. if (p0 < 0) p0 = 0;
  3483. if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
  3484. // models like Mamba or RWKV can't have a state partially erased
  3485. if (cache.recurrent) {
  3486. if (seq_id >= (int64_t) cache.size) {
  3487. // could be fatal
  3488. return false;
  3489. }
  3490. if (0 <= seq_id) {
  3491. int32_t & tail_id = cache.cells[seq_id].tail;
  3492. if (tail_id >= 0) {
  3493. const llama_kv_cell & cell = cache.cells[tail_id];
  3494. // partial intersection is invalid
  3495. if ((0 < p0 && p0 <= cell.pos) || (0 < p1 && p1 <= cell.pos)) {
  3496. return false;
  3497. }
  3498. // invalidate tails which will be cleared
  3499. if (p0 <= cell.pos && cell.pos < p1) {
  3500. tail_id = -1;
  3501. }
  3502. }
  3503. } else {
3504. // if seq_id is negative, the range should include everything or nothing
  3505. if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
  3506. return false;
  3507. }
  3508. }
  3509. }
  3510. for (uint32_t i = 0; i < cache.size; ++i) {
  3511. if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
  3512. if (seq_id < 0) {
  3513. cache.cells[i].seq_id.clear();
  3514. } else if (cache.cells[i].has_seq_id(seq_id)) {
  3515. cache.cells[i].seq_id.erase(seq_id);
  3516. } else {
  3517. continue;
  3518. }
  3519. if (cache.cells[i].is_empty()) {
  3520. // keep count of the number of used cells
  3521. if (cache.cells[i].pos >= 0) cache.used--;
  3522. cache.cells[i].pos = -1;
  3523. cache.cells[i].src = -1;
  3524. if (new_head == cache.size) new_head = i;
  3525. }
  3526. }
  3527. }
  3528. // If we freed up a slot, set head to it so searching can start there.
  3529. if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  3530. return true;
  3531. }
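// Example (illustrative): discard part or all of what a sequence has cached; negative bounds
// widen to the full range as handled above:
//
//     llama_kv_cache_seq_rm(kv_self, seq_id, /*p0=*/n_keep, /*p1=*/-1); // drop positions [n_keep, inf)
//     llama_kv_cache_seq_rm(kv_self, -1,     /*p0=*/-1,     /*p1=*/-1); // or wipe every sequence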
  3532. static void llama_kv_cache_seq_cp(
  3533. struct llama_kv_cache & cache,
  3534. llama_seq_id seq_id_src,
  3535. llama_seq_id seq_id_dst,
  3536. llama_pos p0,
  3537. llama_pos p1) {
  3538. if (p0 < 0) p0 = 0;
  3539. if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
  3540. if (cache.recurrent) {
  3541. if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
  3542. llama_kv_cell & tail_src = cache.cells[seq_id_src];
  3543. llama_kv_cell & tail_dst = cache.cells[seq_id_dst];
  3544. if (tail_dst.tail >= 0) {
  3545. // clear destination seq_id if it wasn't empty
  3546. llama_kv_cell & cell_dst = cache.cells[tail_dst.tail];
  3547. cell_dst.seq_id.erase(seq_id_dst);
  3548. tail_dst.tail = -1;
  3549. if (cell_dst.seq_id.empty()) {
  3550. cell_dst.pos = -1;
  3551. cell_dst.delta = -1;
  3552. cell_dst.src = -1;
  3553. cache.used -= 1;
  3554. }
  3555. }
  3556. if (tail_src.tail >= 0) {
  3557. llama_kv_cell & cell_src = cache.cells[tail_src.tail];
  3558. cell_src.seq_id.insert(seq_id_dst);
  3559. tail_dst.tail = tail_src.tail;
  3560. }
  3561. }
  3562. return;
  3563. }
  3564. // otherwise, this is the KV cache of a Transformer-like model
  3565. cache.head = 0;
  3566. for (uint32_t i = 0; i < cache.size; ++i) {
  3567. if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
  3568. cache.cells[i].seq_id.insert(seq_id_dst);
  3569. }
  3570. }
  3571. }
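// Example (illustrative): share an already-evaluated prompt between sequences instead of
// re-evaluating it - for a Transformer-style cache this only adds seq_id_dst to the existing cells:
//
//     llama_kv_cache_seq_cp(kv_self, /*seq_id_src=*/0, /*seq_id_dst=*/1, /*p0=*/0, /*p1=*/n_prompt);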
  3572. static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
  3573. uint32_t new_head = cache.size;
  3574. for (uint32_t i = 0; i < cache.size; ++i) {
  3575. if (cache.recurrent && (llama_seq_id) i != seq_id) {
  3576. cache.cells[i].tail = -1;
  3577. }
  3578. if (!cache.cells[i].has_seq_id(seq_id)) {
  3579. if (cache.cells[i].pos >= 0) cache.used--;
  3580. cache.cells[i].pos = -1;
  3581. cache.cells[i].src = -1;
  3582. cache.cells[i].seq_id.clear();
  3583. if (new_head == cache.size) new_head = i;
  3584. } else {
  3585. cache.cells[i].seq_id.clear();
  3586. cache.cells[i].seq_id.insert(seq_id);
  3587. }
  3588. }
  3589. // If we freed up a slot, set head to it so searching can start there.
  3590. if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
  3591. }
  3592. static void llama_kv_cache_seq_add(
  3593. struct llama_kv_cache & cache,
  3594. llama_seq_id seq_id,
  3595. llama_pos p0,
  3596. llama_pos p1,
  3597. llama_pos delta) {
  3598. uint32_t new_head = cache.size;
  3599. if (p0 < 0) p0 = 0;
  3600. if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
  3601. // If there is no range then return early to avoid looping over the cache.
  3602. if (p0 == p1) return;
  3603. if (cache.recurrent) {
  3604. // for Mamba-like or RWKV models, only the pos needs to be shifted
  3605. if (0 <= seq_id && seq_id < (int64_t) cache.size) {
  3606. const int32_t tail_id = cache.cells[seq_id].tail;
  3607. if (tail_id >= 0) {
  3608. llama_kv_cell & cell = cache.cells[tail_id];
  3609. if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
  3610. cell.pos += delta;
  3611. }
  3612. }
  3613. }
  3614. return;
  3615. }
  3616. for (uint32_t i = 0; i < cache.size; ++i) {
  3617. if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
  3618. cache.has_shift = true;
  3619. cache.cells[i].pos += delta;
  3620. cache.cells[i].delta += delta;
  3621. if (cache.cells[i].pos < 0) {
  3622. if (!cache.cells[i].is_empty()) {
  3623. cache.used--;
  3624. }
  3625. cache.cells[i].pos = -1;
  3626. cache.cells[i].seq_id.clear();
  3627. if (new_head == cache.size) {
  3628. new_head = i;
  3629. }
  3630. }
  3631. }
  3632. }
  3633. // If we freed up a slot, set head to it so searching can start there.
  3634. // Otherwise we just start the next search from the beginning.
  3635. cache.head = new_head != cache.size ? new_head : 0;
  3636. }
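// Usage sketch (illustrative of how context shifting is typically driven; parameter names are
// placeholders): after discarding the oldest n_discard tokens of a sequence, slide the remaining
// positions down so decoding can continue without exceeding the context size:
//
//     llama_kv_cache_seq_rm (kv_self, seq_id, n_keep,             n_keep + n_discard);
//     llama_kv_cache_seq_add(kv_self, seq_id, n_keep + n_discard, -1, -n_discard);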
  3637. static void llama_kv_cache_seq_div(
  3638. struct llama_kv_cache & cache,
  3639. llama_seq_id seq_id,
  3640. llama_pos p0,
  3641. llama_pos p1,
  3642. int d) {
  3643. if (p0 < 0) p0 = 0;
  3644. if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
  3645. // If there is no range then return early to avoid looping over the cache.
  3646. if (p0 == p1) return;
  3647. if (cache.recurrent) {
  3648. // for Mamba-like or RWKV models, only the pos needs to be changed
  3649. if (0 <= seq_id && seq_id < (int64_t) cache.size) {
  3650. const int32_t tail_id = cache.cells[seq_id].tail;
  3651. if (tail_id >= 0) {
  3652. llama_kv_cell & cell = cache.cells[tail_id];
  3653. if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
  3654. cell.pos /= d;
  3655. }
  3656. }
  3657. }
  3658. return;
  3659. }
  3660. for (uint32_t i = 0; i < cache.size; ++i) {
  3661. if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
  3662. cache.has_shift = true;
  3663. {
  3664. llama_pos p_old = cache.cells[i].pos;
  3665. cache.cells[i].pos /= d;
  3666. cache.cells[i].delta += cache.cells[i].pos - p_old;
  3667. }
  3668. }
  3669. }
  3670. }
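// Example (illustrative sketch, not a verbatim excerpt): compress a range of positions by an
// integer factor, as done by grouped-attention / "self-extend" style context extension:
//
//     llama_kv_cache_seq_div(kv_self, seq_id, /*p0=*/0, /*p1=*/n_past, /*d=*/4); // pos p -> p/4 for p in [0, n_past)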
  3671. static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
  3672. llama_pos result = 0;
  3673. for (uint32_t i = 0; i < cache.size; ++i) {
  3674. if (cache.cells[i].has_seq_id(seq_id)) {
  3675. result = std::max(result, cache.cells[i].pos);
  3676. }
  3677. }
  3678. return result;
  3679. }
  3680. static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
  3681. if (!cache.recurrent) {
  3682. cache.do_defrag = true;
  3683. }
  3684. }
  3685. static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
  3686. // the FA kernels require padding to avoid extra runtime boundary checks
  3687. return cparams.flash_attn ? 256u : 32u;
  3688. }
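// Worked example (illustrative): with 900 cells in use, the flash-attention padding of 256 rounds
// the visible KV range up to GGML_PAD(900, 256) = 1024, while the default padding of 32 gives
// GGML_PAD(900, 32) = 928.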
  3689. //
  3690. // model loading and saving
  3691. //
  3692. enum llama_fver {
  3693. GGUF_FILE_VERSION_V1 = 1,
  3694. GGUF_FILE_VERSION_V2 = 2,
  3695. GGUF_FILE_VERSION_V3 = 3,
  3696. };
  3697. static const char * llama_file_version_name(llama_fver version) {
  3698. switch (version) {
3699. case GGUF_FILE_VERSION_V1: return "GGUF V1 (supported until Nov 2023)";
  3700. case GGUF_FILE_VERSION_V2: return "GGUF V2";
  3701. case GGUF_FILE_VERSION_V3: return "GGUF V3 (latest)";
  3702. }
  3703. return "unknown";
  3704. }
  3705. static std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
  3706. char buf[256];
  3707. snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
  3708. for (size_t i = 1; i < ne.size(); i++) {
  3709. snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
  3710. }
  3711. return buf;
  3712. }
  3713. static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
  3714. char buf[256];
  3715. snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
  3716. for (int i = 1; i < GGML_MAX_DIMS; i++) {
  3717. snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
  3718. }
  3719. return buf;
  3720. }
  3721. namespace GGUFMeta {
  3722. template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
  3723. struct GKV_Base_Type {
  3724. static constexpr gguf_type gt = gt_;
  3725. static T getter(const gguf_context * ctx, const int kid) {
  3726. return gfun(ctx, kid);
  3727. }
  3728. };
  3729. template<typename T> struct GKV_Base;
  3730. template<> struct GKV_Base<bool >: GKV_Base_Type<bool, GGUF_TYPE_BOOL, gguf_get_val_bool> {};
  3731. template<> struct GKV_Base<uint8_t >: GKV_Base_Type<uint8_t, GGUF_TYPE_UINT8, gguf_get_val_u8 > {};
  3732. template<> struct GKV_Base<uint16_t >: GKV_Base_Type<uint16_t, GGUF_TYPE_UINT16, gguf_get_val_u16 > {};
  3733. template<> struct GKV_Base<uint32_t >: GKV_Base_Type<uint32_t, GGUF_TYPE_UINT32, gguf_get_val_u32 > {};
  3734. template<> struct GKV_Base<uint64_t >: GKV_Base_Type<uint64_t, GGUF_TYPE_UINT64, gguf_get_val_u64 > {};
  3735. template<> struct GKV_Base<int8_t >: GKV_Base_Type<int8_t, GGUF_TYPE_INT8, gguf_get_val_i8 > {};
  3736. template<> struct GKV_Base<int16_t >: GKV_Base_Type<int16_t, GGUF_TYPE_INT16, gguf_get_val_i16 > {};
  3737. template<> struct GKV_Base<int32_t >: GKV_Base_Type<int32_t, GGUF_TYPE_INT32, gguf_get_val_i32 > {};
  3738. template<> struct GKV_Base<int64_t >: GKV_Base_Type<int64_t, GGUF_TYPE_INT64, gguf_get_val_i64 > {};
  3739. template<> struct GKV_Base<float >: GKV_Base_Type<float, GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
  3740. template<> struct GKV_Base<double >: GKV_Base_Type<double, GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
  3741. template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING, gguf_get_val_str > {};
  3742. template<> struct GKV_Base<std::string> {
  3743. static constexpr gguf_type gt = GGUF_TYPE_STRING;
  3744. static std::string getter(const gguf_context * ctx, const int kid) {
  3745. return gguf_get_val_str(ctx, kid);
  3746. }
  3747. };
  3748. struct ArrayInfo {
  3749. const gguf_type gt;
  3750. const size_t length;
  3751. const void * data;
  3752. };
  3753. template<> struct GKV_Base<ArrayInfo> {
  3754. public:
  3755. static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
  3756. static ArrayInfo getter(const gguf_context *ctx, const int k) {
  3757. return ArrayInfo {
  3758. gguf_get_arr_type(ctx, k),
  3759. size_t(gguf_get_arr_n(ctx, k)),
  3760. gguf_get_arr_data(ctx, k),
  3761. };
  3762. }
  3763. };
  3764. template<typename T>
  3765. class GKV : public GKV_Base<T> {
  3766. GKV() = delete;
  3767. public:
  3768. static T get_kv(const gguf_context * ctx, const int k) {
  3769. const enum gguf_type kt = gguf_get_kv_type(ctx, k);
  3770. if (kt != GKV::gt) {
  3771. throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
  3772. gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
  3773. }
  3774. return GKV::getter(ctx, k);
  3775. }
  3776. static const char * override_type_to_str(const llama_model_kv_override_type ty) {
  3777. switch (ty) {
  3778. case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
  3779. case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
  3780. case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
  3781. case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
  3782. }
  3783. return "unknown";
  3784. }
  3785. static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
  3786. if (!ovrd) { return false; }
  3787. if (ovrd->tag == expected_type) {
  3788. LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
  3789. __func__, override_type_to_str(ovrd->tag), ovrd->key);
  3790. switch (ovrd->tag) {
  3791. case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
  3792. LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
  3793. } break;
  3794. case LLAMA_KV_OVERRIDE_TYPE_INT: {
  3795. LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
  3796. } break;
  3797. case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
  3798. LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
  3799. } break;
  3800. case LLAMA_KV_OVERRIDE_TYPE_STR: {
  3801. LLAMA_LOG_INFO("%s\n", ovrd->val_str);
  3802. } break;
  3803. default:
  3804. // Shouldn't be possible to end up here, but just in case...
  3805. throw std::runtime_error(
  3806. format("Unsupported attempt to override %s type for metadata key %s\n",
  3807. override_type_to_str(ovrd->tag), ovrd->key));
  3808. }
  3809. return true;
  3810. }
  3811. LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
  3812. __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
  3813. return false;
  3814. }
  3815. template<typename OT>
  3816. static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
  3817. try_override(OT & target, const struct llama_model_kv_override * ovrd) {
  3818. if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
  3819. target = ovrd->val_bool;
  3820. return true;
  3821. }
  3822. return false;
  3823. }
  3824. template<typename OT>
  3825. static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
  3826. try_override(OT & target, const struct llama_model_kv_override * ovrd) {
  3827. if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
  3828. target = ovrd->val_i64;
  3829. return true;
  3830. }
  3831. return false;
  3832. }
  3833. template<typename OT>
  3834. static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
3835. try_override(OT & target, const struct llama_model_kv_override * ovrd) {
  3836. if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
  3837. target = ovrd->val_f64;
  3838. return true;
  3839. }
  3840. return false;
  3841. }
  3842. template<typename OT>
  3843. static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
3844. try_override(OT & target, const struct llama_model_kv_override * ovrd) {
  3845. if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
  3846. target = ovrd->val_str;
  3847. return true;
  3848. }
  3849. return false;
  3850. }
  3851. static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
  3852. if (try_override<T>(target, ovrd)) {
  3853. return true;
  3854. }
  3855. if (k < 0) { return false; }
  3856. target = get_kv(ctx, k);
  3857. return true;
  3858. }
  3859. static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
  3860. return set(ctx, gguf_find_key(ctx, key), target, ovrd);
  3861. }
  3862. static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
  3863. return set(ctx, key.c_str(), target, ovrd);
  3864. }
  3865. };
  3866. }
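// Usage sketch (illustrative): reading a typed metadata value through GGUFMeta, optionally routed
// through a llama_model_kv_override; llama_model_loader::get_key below builds on exactly this:
//
//     uint32_t n_ctx_train = 0;
//     GGUFMeta::GKV<uint32_t>::set(meta, "llama.context_length", n_ctx_train, /*ovrd=*/nullptr);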
  3867. using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
  3868. static size_t llama_model_max_nodes(const llama_model & model) {
  3869. return std::max<size_t>(8192, model.tensors_by_name.size()*5);
  3870. }
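// Worked example (illustrative): a model with 291 tensors gets max(8192, 291*5) = 8192 graph
// nodes, while a large MoE with 3000 tensors gets 3000*5 = 15000 - the 8192 floor only matters
// for models with relatively few tensors.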
  3871. struct llama_model_loader {
  3872. int n_kv = 0;
  3873. int n_tensors = 0;
  3874. int n_created = 0;
  3875. int64_t n_elements = 0;
  3876. size_t n_bytes = 0;
  3877. bool use_mmap = false;
  3878. bool check_tensors;
  3879. llama_files files;
  3880. llama_ftype ftype;
  3881. llama_fver fver;
  3882. llama_mmaps mappings;
  3883. // Holds information on a model weight
  3884. struct llama_tensor_weight {
  3885. uint16_t idx; // source file index
  3886. size_t offs; // tensor data offset in the original file
  3887. ggml_tensor * tensor;
  3888. llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) {
  3889. const int tensor_idx = gguf_find_tensor(gguf_ctx, name);
  3890. offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx);
  3891. if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) {
  3892. throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name));
  3893. }
  3894. }
  3895. };
  3896. std::vector<llama_tensor_weight> weights;
  3897. std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
  3898. struct gguf_context * meta = NULL;
  3899. std::vector<ggml_context *> contexts;
  3900. std::string arch_name;
  3901. LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
  3902. llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
  3903. int trace = 0;
  3904. if (getenv("LLAMA_TRACE")) {
  3905. trace = atoi(getenv("LLAMA_TRACE"));
  3906. }
  3907. if (param_overrides_p != nullptr) {
  3908. for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) {
  3909. kv_overrides.insert({std::string(p->key), *p});
  3910. }
  3911. }
  3912. struct ggml_context * ctx = NULL;
  3913. struct gguf_init_params params = {
  3914. /*.no_alloc = */ true,
  3915. /*.ctx = */ &ctx,
  3916. };
  3917. meta = gguf_init_from_file(fname.c_str(), params);
  3918. if (!meta) {
  3919. throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
  3920. }
  3921. get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  3922. llm_kv = LLM_KV(llm_arch_from_string(arch_name));
  3923. files.emplace_back(new llama_file(fname.c_str(), "rb"));
  3924. contexts.emplace_back(ctx);
3925. // Save the tensor data offsets of the main file.
3926. // For subsidiary files, the tensor data offsets from `meta` must not be used,
3927. // so we build a unified tensor index for the weights.
  3928. for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
  3929. weights.emplace_back(files.back().get(), 0, cur->name, meta, cur);
  3930. }
  3931. uint16_t n_split = 0;
  3932. get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false);
  3933. // Load additional GGML contexts
  3934. if (n_split > 1) {
  3935. uint16_t idx = 0;
  3936. get_key(llm_kv(LLM_KV_SPLIT_NO), idx);
  3937. if (idx != 0) {
  3938. throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx));
  3939. }
  3940. char split_prefix[PATH_MAX] = {0};
  3941. if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
  3942. throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
  3943. }
  3944. if (trace > 0) {
  3945. LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split);
  3946. }
  3947. char split_path[PATH_MAX] = {0};
  3948. for (idx = 1; idx < n_split; idx++) {
  3949. llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
  3950. struct gguf_init_params split_params = {
  3951. /*.no_alloc = */ true,
  3952. /*.ctx = */ &ctx,
  3953. };
  3954. struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params);
  3955. if (!ctx_gguf) {
  3956. throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path));
  3957. }
  3958. files.emplace_back(new llama_file(split_path, "rb"));
  3959. contexts.emplace_back(ctx);
3960. // Save the tensor data offset info of this shard.
  3961. for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
  3962. weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur);
  3963. }
  3964. gguf_free(ctx_gguf);
  3965. }
  3966. get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors);
  3967. // sanity check
  3968. {
  3969. const int n_tensors_loaded = (int) weights.size();
  3970. if (n_tensors != n_tensors_loaded) {
  3971. throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded));
  3972. }
  3973. }
3974. LLAMA_LOG_INFO("%s: loaded metadata from %d additional GGUF shards\n", __func__, n_split - 1);
  3975. }
  3976. n_kv = gguf_get_n_kv(meta);
  3977. n_tensors = weights.size();
  3978. fver = (enum llama_fver) gguf_get_version(meta);
  3979. std::set<std::string> tensor_names;
  3980. for (auto & w : weights) {
  3981. n_elements += ggml_nelements(w.tensor);
  3982. n_bytes += ggml_nbytes(w.tensor);
3983. // make sure there are no duplicated tensor names
  3984. const std::string name(w.tensor->name);
  3985. auto found = tensor_names.find(name);
  3986. if (found != tensor_names.end()) {
  3987. throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
  3988. }
  3989. tensor_names.insert(name);
  3990. }
  3991. LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
  3992. __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver));
3993. // determine the file type based on the number of tensors for each quantization and print metadata
  3994. // TODO: make optional
  3995. {
  3996. std::map<enum ggml_type, uint32_t> n_type;
  3997. uint32_t n_type_max = 0;
  3998. enum ggml_type type_max = GGML_TYPE_F32;
  3999. for (int i = 0; i < n_tensors; i++) {
  4000. const ggml_tensor * tensor = weights.at(i).tensor;
  4001. enum ggml_type type = tensor->type;
  4002. n_type[type]++;
  4003. if (n_type_max < n_type[type]) {
  4004. n_type_max = n_type[type];
  4005. type_max = type;
  4006. }
  4007. if (trace > 0) {
  4008. const uint16_t sid = weights.at(i).idx;
  4009. LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str());
  4010. }
  4011. }
  4012. switch (type_max) {
  4013. case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
  4014. case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
  4015. case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
  4016. case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
  4017. case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
  4018. case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
  4019. case GGML_TYPE_Q5_1: ftype = LLAMA_FTYPE_MOSTLY_Q5_1; break;
  4020. case GGML_TYPE_Q8_0: ftype = LLAMA_FTYPE_MOSTLY_Q8_0; break;
  4021. case GGML_TYPE_Q2_K: ftype = LLAMA_FTYPE_MOSTLY_Q2_K; break;
  4022. case GGML_TYPE_Q3_K: ftype = LLAMA_FTYPE_MOSTLY_Q3_K_M; break;
  4023. case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
  4024. case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
  4025. case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
  4026. case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
  4027. case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
  4028. case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
  4029. case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
  4030. case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
  4031. case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break;
  4032. case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
  4033. case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
  4034. case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
  4035. case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
  4036. case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
  4037. case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
  4038. default:
  4039. {
  4040. LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
  4041. ftype = LLAMA_FTYPE_ALL_F32;
  4042. } break;
  4043. }
  4044. // this is a way to mark that we have "guessed" the file type
  4045. ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED);
  4046. {
  4047. const int kid = gguf_find_key(meta, "general.file_type"); // TODO: use LLM_KV
  4048. if (kid >= 0) {
  4049. ftype = (llama_ftype) gguf_get_val_u32(meta, kid);
  4050. }
  4051. }
  4052. LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
  4053. for (int i = 0; i < n_kv; i++) {
  4054. const char * name = gguf_get_key(meta, i);
  4055. const enum gguf_type type = gguf_get_kv_type(meta, i);
  4056. const std::string type_name =
  4057. type == GGUF_TYPE_ARRAY
  4058. ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i))
  4059. : gguf_type_name(type);
  4060. std::string value = gguf_kv_to_str(meta, i);
  4061. const size_t MAX_VALUE_LEN = 40;
  4062. if (value.size() > MAX_VALUE_LEN) {
  4063. value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
  4064. }
  4065. replace_all(value, "\n", "\\n");
  4066. LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
  4067. }
  4068. // print type counts
  4069. for (auto & kv : n_type) {
  4070. if (kv.second == 0) {
  4071. continue;
  4072. }
  4073. LLAMA_LOG_INFO("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
  4074. }
  4075. }
  4076. if (!llama_mmap::SUPPORTED) {
  4077. LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
  4078. use_mmap = false;
  4079. }
  4080. this->use_mmap = use_mmap;
  4081. this->check_tensors = check_tensors;
  4082. }
  4083. ~llama_model_loader() {
  4084. if (meta) {
  4085. gguf_free(meta);
  4086. }
  4087. for (auto * ctx : contexts) {
  4088. ggml_free(ctx);
  4089. }
  4090. }
  4091. template<typename T>
  4092. typename std::enable_if<std::is_integral<T>::value, bool>::type
  4093. get_arr_n(const std::string & key, T & result, const bool required = true) {
  4094. const int kid = gguf_find_key(meta, key.c_str());
  4095. if (kid < 0) {
  4096. if (required) {
  4097. throw std::runtime_error(format("key not found in model: %s", key.c_str()));
  4098. }
  4099. return false;
  4100. }
  4101. struct GGUFMeta::ArrayInfo arr_info =
  4102. GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
  4103. result = arr_info.length;
  4104. return true;
  4105. }
  4106. template<typename T>
  4107. typename std::enable_if<std::is_integral<T>::value, bool>::type
  4108. get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
  4109. return get_arr_n(llm_kv(kid), result, required);
  4110. }
  4111. template<typename T>
  4112. bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
  4113. const int kid = gguf_find_key(meta, key.c_str());
  4114. if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) {
  4115. if (required) {
  4116. throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
  4117. }
  4118. return false;
  4119. }
  4120. struct GGUFMeta::ArrayInfo arr_info =
  4121. GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
  4122. switch (arr_info.gt) {
  4123. case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
  4124. case GGUF_TYPE_INT32: GGML_ASSERT(
  4125. (std::is_same<T, int32_t>::value) ||
  4126. (std::is_same<T, uint32_t>::value)); break;
  4127. default:
4128. throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
  4129. }
  4130. result.resize(arr_info.length);
  4131. result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
  4132. return true;
  4133. }
  4134. template<typename T, size_t N_MAX>
  4135. bool get_arr(const std::string & key, std::array<T, N_MAX> & result, const bool required = true) {
  4136. const int kid = gguf_find_key(meta, key.c_str());
  4137. if (kid < 0 || gguf_get_kv_type(meta, kid) != GGUF_TYPE_ARRAY) {
  4138. if (required) {
  4139. throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
  4140. }
  4141. return false;
  4142. }
  4143. struct GGUFMeta::ArrayInfo arr_info =
  4144. GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
  4145. switch (arr_info.gt) {
  4146. case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
  4147. case GGUF_TYPE_INT32: GGML_ASSERT(
  4148. (std::is_same<T, int32_t>::value) ||
  4149. (std::is_same<T, uint32_t>::value)); break;
  4150. default:
4151. throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
  4152. }
  4153. if (arr_info.length > N_MAX) {
  4154. throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
  4155. }
  4156. std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
  4157. return true;
  4158. }
  4159. template<typename T>
  4160. bool get_arr(const enum llm_kv kid, T & result, const bool required = true) {
  4161. return get_arr(llm_kv(kid), result, required);
  4162. }
  4163. template<typename T>
  4164. bool get_key(const std::string & key, T & result, const bool required = true) {
  4165. auto it = kv_overrides.find(key);
  4166. const struct llama_model_kv_override * override =
  4167. it != kv_overrides.end() ? &it->second : nullptr;
  4168. const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
  4169. if (required && !found) {
  4170. throw std::runtime_error(format("key not found in model: %s", key.c_str()));
  4171. }
  4172. return found;
  4173. }
  4174. template<typename T>
  4175. bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
  4176. return get_key(llm_kv(kid), result, required);
  4177. }
  4178. // get array of n <= N_MAX elements, or a single element repeated n times
  4179. template<typename T, size_t N_MAX>
  4180. bool get_key_or_arr(const std::string & key, std::array<T, N_MAX> & result, uint32_t n, const bool required = true) {
  4181. const int kid = gguf_find_key(meta, key.c_str());
  4182. if (kid < 0) {
  4183. if (required) {
  4184. throw std::runtime_error(format("key not found in model: %s", key.c_str()));
  4185. }
  4186. return false;
  4187. }
  4188. if (n > N_MAX) {
  4189. throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str()));
  4190. }
  4191. if (gguf_get_kv_type(meta, kid) == GGUF_TYPE_ARRAY) {
  4192. struct GGUFMeta::ArrayInfo arr_info =
  4193. GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
  4194. if (n != arr_info.length) {
  4195. throw std::runtime_error(format("key %s has wrong array length; expected %u, got %u", key.c_str(), n, (uint32_t) arr_info.length));
  4196. }
  4197. return get_arr(key, result, required);
  4198. } else {
  4199. T value;
  4200. bool ok = get_key(key, value, required);
  4201. if (!ok) {
  4202. return false;
  4203. }
  4204. for (uint32_t i = 0; i < n; i++) {
  4205. result[i] = value;
  4206. }
  4207. return true;
  4208. }
  4209. }
  4210. template<typename T>
  4211. bool get_key_or_arr(const enum llm_kv kid, T & result, uint32_t n, const bool required = true) {
  4212. return get_key_or_arr(llm_kv(kid), result, n, required);
  4213. }
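// Usage sketch (illustrative, field names assumed): hyperparameters that may be stored either
// per-layer or as a single repeated value are read through this helper, e.g.:
//
//     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);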
  4214. std::string get_arch_name() const {
  4215. return arch_name;
  4216. }
  4217. enum llm_arch get_arch() const {
  4218. return llm_kv.arch;
  4219. }
  4220. const char * get_tensor_name(int i) const {
  4221. return weights.at(i).tensor->name;
  4222. }
  4223. const llama_tensor_weight * get_weight(const char * name) const {
  4224. for (const auto & weight : weights) {
  4225. if (strcmp(name, weight.tensor->name) == 0) {
  4226. return &weight;
  4227. }
  4228. }
  4229. return nullptr;
  4230. }
  4231. const llama_tensor_weight * get_weight(int i) const {
  4232. return get_weight(get_tensor_name(i));
  4233. }
  4234. const llama_tensor_weight & require_weight(const char * name) const {
  4235. const llama_tensor_weight * weight = get_weight(name);
  4236. if (!weight) {
  4237. throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
  4238. }
  4239. return *weight;
  4240. }
  4241. struct ggml_tensor * get_tensor_meta(const char * name) const {
  4242. const auto * weight = get_weight(name);
  4243. if (!weight) {
  4244. return nullptr;
  4245. }
  4246. return weight->tensor;
  4247. }
  4248. struct ggml_tensor * require_tensor_meta(const char * name) const {
  4249. struct ggml_tensor * tensor = get_tensor_meta(name);
  4250. if (!tensor) {
  4251. throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name));
  4252. }
  4253. return tensor;
  4254. }
  4255. struct ggml_tensor * get_tensor_meta(int i) const {
  4256. return get_tensor_meta(get_tensor_name(i));
  4257. }
  4258. struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
  4259. struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
  4260. ggml_set_name(tensor, ggml_get_name(cur));
  4261. if (duplicated) {
  4262. size_data += ggml_nbytes(cur);
  4263. } else {
  4264. n_created++;
  4265. }
  4266. return tensor;
  4267. }
  4268. const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector<int64_t> & ne, bool required) const {
  4269. const struct ggml_tensor * cur = get_tensor_meta(name.c_str());
  4270. if (cur == NULL) {
  4271. if (!required) {
  4272. return NULL;
  4273. }
  4274. throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
  4275. }
  4276. {
  4277. bool is_ok = true;
  4278. for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
  4279. if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) {
  4280. is_ok = false;
  4281. break;
  4282. }
  4283. }
  4284. if (!is_ok) {
  4285. throw std::runtime_error(
  4286. format("%s: tensor '%s' has wrong shape; expected %s, got %s",
  4287. __func__, name.c_str(),
  4288. llama_format_tensor_shape(ne).c_str(),
  4289. llama_format_tensor_shape(cur).c_str()));
  4290. }
  4291. }
  4292. return cur;
  4293. }
  4294. static const int TENSOR_NOT_REQUIRED = 1;
  4295. static const int TENSOR_DUPLICATED = 2;
  4296. struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
  4297. const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
  4298. if (cur == NULL) {
  4299. return NULL;
  4300. }
  4301. return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
  4302. }
  4303. struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
  4304. const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
  4305. if (cur == NULL) {
  4306. return NULL;
  4307. }
  4308. if (cur->type != base->type) {
  4309. throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type)));
  4310. }
  4311. std::array<int64_t, GGML_MAX_DIMS> dims;
  4312. for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
  4313. dims[i] = i < ne.size() ? ne[i] : 1;
  4314. }
  4315. struct ggml_tensor * tensor = ggml_view_4d(ctx, base,
  4316. dims[0], dims[1], dims[2], dims[3],
  4317. cur->nb[1], cur->nb[2], cur->nb[3],
  4318. offset);
  4319. ggml_set_name(tensor, name.c_str());
  4320. n_created++;
  4321. return tensor;
  4322. }
  4323. void done_getting_tensors() const {
  4324. if (n_created != n_tensors) {
  4325. throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
  4326. }
  4327. }
  4328. void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) {
  4329. if (use_mmap) {
  4330. mappings.reserve(files.size());
  4331. mmaps_used.reserve(files.size());
  4332. for (const auto & file : files) {
  4333. std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa()));
  4334. mmaps_used.emplace_back(mapping->size, 0);
  4335. if (mlock_mmaps) {
  4336. std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
  4337. mlock_mmap->init(mapping->addr);
  4338. mlock_mmaps->emplace_back(std::move(mlock_mmap));
  4339. }
  4340. mappings.emplace_back(std::move(mapping));
  4341. }
  4342. }
  4343. // compute the total size of all tensors for progress reporting
  4344. for (auto & w : weights) {
  4345. size_data += ggml_nbytes(w.tensor);
  4346. }
  4347. }
  4348. void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const {
  4349. GGML_ASSERT(!mappings.empty());
  4350. const auto & mapping = mappings.at(idx);
  4351. *first = mapping->size;
  4352. *last = 0;
  4353. *addr = mapping->addr;
  4354. for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
  4355. try {
  4356. const auto * weight = get_weight(ggml_get_name(tensor));
  4357. if (!weight) {
  4358. continue;
  4359. }
  4360. if (weight->idx != idx) {
  4361. continue;
  4362. }
  4363. *first = std::min(*first, weight->offs);
  4364. *last = std::max(*last, weight->offs + ggml_nbytes(tensor));
  4365. } catch(...) {
  4366. // the tensor is not in the model
  4367. }
  4368. }
  4369. }
  4370. // for backwards compatibility, does not support ggml-backend
  4371. void load_data_for(struct ggml_tensor * cur) const {
  4372. const auto & w = require_weight(ggml_get_name(cur));
  4373. if (use_mmap) {
  4374. const auto & mapping = mappings.at(w.idx);
  4375. if (cur->data == nullptr) {
  4376. cur->data = (uint8_t *)mapping->addr + w.offs;
  4377. } else {
  4378. memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur));
  4379. }
  4380. } else {
  4381. GGML_ASSERT(cur->data != nullptr);
  4382. GGML_ASSERT(w.idx < files.size());
  4383. const auto & file = files.at(w.idx);
  4384. file->seek(w.offs, SEEK_SET);
  4385. file->read_raw(cur->data, ggml_nbytes(cur));
  4386. }
  4387. if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
  4388. throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
  4389. }
  4390. }
  4391. size_t size_done = 0;
  4392. size_t size_data = 0;
  4393. std::vector<std::pair<size_t, size_t>> mmaps_used;
  4394. // Returns false if cancelled by progress_callback
  4395. bool load_all_data(
  4396. struct ggml_context * ctx,
  4397. llama_buf_map & bufs_mmap,
  4398. llama_mlocks * lmlocks,
  4399. llama_progress_callback progress_callback,
  4400. void * progress_callback_user_data) {
  4401. GGML_ASSERT(size_data != 0 && "call init_mappings() first");
  4402. std::vector<no_init<uint8_t>> read_buf;
  4403. std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
  4404. #if defined(GGML_USE_CUDA)
4405. // 4 staging buffers of 1 MB each seem to be a good default for async uploads from a single NVMe drive.
4406. // NVMe RAID configurations might require more / larger buffers.
  4407. constexpr size_t n_buffers = 4;
  4408. constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
  4409. std::vector<ggml_backend_buffer_t> host_buffers;
  4410. std::vector<void*> host_ptrs;
  4411. std::vector<ggml_backend_event_t> events;
  4412. size_t buffer_idx = 0; // buffer to use for async loads
  4413. ggml_backend_t cuda_backend = nullptr;
  4414. if (!use_mmap && !check_tensors) {
4415. // When not using mmapped I/O, use async uploads from pinned memory to GPU memory.
  4416. // First determine if the CUDA backend is active, and if so, determine the device ID.
  4417. ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
  4418. if (buf) {
  4419. ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
  4420. for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
  4421. auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
  4422. if (buffer_type == cuda_buffer_type) {
  4423. cuda_backend = ggml_backend_cuda_init(i);
  4424. break;
  4425. }
  4426. }
  4427. }
4428. // If the CUDA backend is active, create pinned memory buffers and events for synchronisation.
  4429. if (cuda_backend) {
  4430. for (size_t idx = 0; idx < n_buffers; ++idx) {
  4431. host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
  4432. host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
  4433. events.emplace_back(ggml_backend_event_new(cuda_backend));
  4434. }
  4435. }
  4436. }
  4437. #endif
  4438. for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
  4439. const auto * weight = get_weight(ggml_get_name(cur));
  4440. if (weight == nullptr) {
4441. // this can happen with split-experts models
  4442. continue;
  4443. }
  4444. if (progress_callback) {
  4445. if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
  4446. return false;
  4447. }
  4448. }
  4449. size_t n_size = ggml_nbytes(cur);
  4450. if (use_mmap) {
  4451. const auto & mapping = mappings.at(weight->idx);
  4452. ggml_backend_buffer_t buf_mmap = nullptr;
  4453. if (bufs_mmap.count(weight->idx)) {
  4454. buf_mmap = bufs_mmap.at(weight->idx);
  4455. }
  4456. uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
  4457. if (check_tensors) {
  4458. validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
  4459. return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
  4460. }));
  4461. }
  4462. GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
  4463. if (buf_mmap && cur->data == nullptr) {
  4464. ggml_backend_tensor_alloc(buf_mmap, cur, data);
  4465. if (lmlocks) {
  4466. const auto & lmlock = lmlocks->at(weight->idx);
  4467. lmlock->grow_to(weight->offs + n_size);
  4468. }
  4469. auto & mmap_used = mmaps_used[weight->idx];
  4470. mmap_used.first = std::min(mmap_used.first, weight->offs);
  4471. mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
  4472. } else {
  4473. ggml_backend_tensor_set(cur, data, 0, n_size);
  4474. }
  4475. } else {
  4476. GGML_ASSERT(weight->idx < files.size());
  4477. const auto & file = files.at(weight->idx);
  4478. if (ggml_backend_buffer_is_host(cur->buffer)) {
  4479. file->seek(weight->offs, SEEK_SET);
  4480. file->read_raw(cur->data, n_size);
  4481. if (check_tensors) {
  4482. validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
  4483. return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
  4484. }));
  4485. }
  4486. } else {
  4487. #if defined(GGML_USE_CUDA)
4488. // If cuda_backend is valid, load the tensor in chunks into pinned memory and upload the buffers asynchronously to the GPU.
  4489. if (cuda_backend) {
  4490. file->seek(weight->offs, SEEK_SET);
  4491. size_t bytes_read = 0;
  4492. while (bytes_read < n_size) {
  4493. size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
  4494. ggml_backend_event_synchronize(events[buffer_idx]);
  4495. file->read_raw(host_ptrs[buffer_idx], read_iteration);
  4496. ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
  4497. ggml_backend_event_record(events[buffer_idx]);
  4498. bytes_read += read_iteration;
  4499. ++buffer_idx;
  4500. buffer_idx %= n_buffers;
  4501. }
  4502. }
  4503. else
  4504. #endif
  4505. {
  4506. read_buf.resize(n_size);
  4507. file->seek(weight->offs, SEEK_SET);
  4508. file->read_raw(read_buf.data(), n_size);
  4509. ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
  4510. if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
  4511. throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
  4512. }
  4513. }
  4514. }
  4515. }
  4516. size_done += n_size;
  4517. }
  4518. #if defined(GGML_USE_CUDA)
  4519. // free temporary resources used for async cuda uploads
  4520. if (cuda_backend) {
4521. for (size_t idx = 0; idx < n_buffers; ++idx) {
  4522. ggml_backend_event_synchronize(events[idx]);
  4523. ggml_backend_event_free(events[idx]);
  4524. ggml_backend_buffer_free(host_buffers[idx]);
  4525. }
  4526. ggml_backend_free(cuda_backend);
  4527. }
  4528. #endif
  4529. // check validation results
  4530. bool validation_failed = false;
  4531. for (auto & future : validation_result) {
  4532. auto result = future.get();
  4533. if (!result.second) {
  4534. LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
  4535. validation_failed = true;
  4536. }
  4537. }
  4538. if (validation_failed) {
  4539. throw std::runtime_error("found tensors with invalid data");
  4540. }
  4541. // check if this is the last call and do final cleanup
  4542. if (size_done >= size_data) {
  4543. // unmap offloaded tensors and metadata
  4544. if (use_mmap) {
  4545. for (uint32_t idx = 0; idx < mappings.size(); idx++) {
  4546. const auto & mmap_used = mmaps_used.at(idx);
  4547. auto & mapping = mappings.at(idx);
  4548. mapping->unmap_fragment(0, mmap_used.first);
  4549. if (mmap_used.second != 0) {
  4550. mapping->unmap_fragment(mmap_used.second, mapping->size);
  4551. }
  4552. }
  4553. }
  4554. if (progress_callback) {
  4555. // Even though the model is done loading, we still honor
  4556. // cancellation since we need to free allocations.
  4557. return progress_callback(1.0f, progress_callback_user_data);
  4558. }
  4559. }
  4560. return true;
  4561. }
  4562. };
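// Usage sketch (illustrative, simplified relative to the actual model loading code): the loader
// is driven in three phases - read metadata, declare tensors, then stream the tensor data in:
//
//     llama_model_loader ml(fname, /*use_mmap=*/true, /*check_tensors=*/false, /*overrides=*/nullptr);
//     llm_load_arch   (ml, model);                      // resolve the architecture (defined below)
//     llm_load_hparams(ml, model);                      // read the hyperparameters (defined below)
//     // ... ml.create_tensor(...) for every weight ... // increments n_created
//     ml.done_getting_tensors();                        // sanity check: n_created == n_tensors
//     ml.init_mappings();                               // mmap the files and compute size_data
//     ml.load_all_data(ctx, bufs, nullptr, progress_cb, progress_cb_user_data);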
  4563. template<>
  4564. bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
  4565. uint32_t tmp;
  4566. const bool found = get_key(kid, tmp, required);
  4567. if (found) {
  4568. result = (enum llama_pooling_type) tmp;
  4569. } else {
  4570. result = LLAMA_POOLING_TYPE_UNSPECIFIED;
  4571. }
  4572. return found;
  4573. }
  4574. //
  4575. // load LLaMA models
  4576. //
  4577. static const char * llama_model_arch_name(llm_arch arch) {
  4578. auto it = LLM_ARCH_NAMES.find(arch);
  4579. if (it == LLM_ARCH_NAMES.end()) {
  4580. return "unknown";
  4581. }
  4582. return it->second;
  4583. }
  4584. static std::string llama_model_ftype_name(llama_ftype ftype) {
  4585. if (ftype & LLAMA_FTYPE_GUESSED) {
  4586. return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
  4587. }
  4588. switch (ftype) {
  4589. case LLAMA_FTYPE_ALL_F32: return "all F32";
  4590. case LLAMA_FTYPE_MOSTLY_F16: return "F16";
  4591. case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
  4592. case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
  4593. case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
  4594. case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
  4595. case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
  4596. case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
  4597. case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
  4598. case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
  4599. case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
  4600. case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
  4601. case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
  4602. case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
  4603. case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
  4604. case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
  4605. case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
  4606. case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
  4607. case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
  4608. case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
  4609. case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
  4610. case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
  4611. case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
  4612. case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
  4613. case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
  4614. case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
  4615. case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
  4616. case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
  4617. case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
  4618. case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
  4619. case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
  4620. case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
  4621. case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
  4622. default: return "unknown, may not work";
  4623. }
  4624. }
  4625. static const char * llama_model_type_name(e_model type) {
  4626. switch (type) {
  4627. case MODEL_14M: return "14M";
  4628. case MODEL_17M: return "17M";
  4629. case MODEL_22M: return "22M";
  4630. case MODEL_33M: return "33M";
  4631. case MODEL_60M: return "60M";
  4632. case MODEL_70M: return "70M";
  4633. case MODEL_80M: return "80M";
  4634. case MODEL_109M: return "109M";
  4635. case MODEL_137M: return "137M";
  4636. case MODEL_160M: return "160M";
  4637. case MODEL_220M: return "220M";
  4638. case MODEL_250M: return "250M";
  4639. case MODEL_270M: return "270M";
  4640. case MODEL_335M: return "335M";
  4641. case MODEL_410M: return "410M";
  4642. case MODEL_450M: return "450M";
  4643. case MODEL_770M: return "770M";
  4644. case MODEL_780M: return "780M";
  4645. case MODEL_0_5B: return "0.5B";
  4646. case MODEL_1B: return "1B";
  4647. case MODEL_1_3B: return "1.3B";
  4648. case MODEL_1_4B: return "1.4B";
  4649. case MODEL_1_6B: return "1.6B";
  4650. case MODEL_2B: return "2B";
  4651. case MODEL_2_8B: return "2.8B";
  4652. case MODEL_3B: return "3B";
  4653. case MODEL_4B: return "4B";
  4654. case MODEL_6B: return "6B";
  4655. case MODEL_6_9B: return "6.9B";
  4656. case MODEL_7B: return "7B";
  4657. case MODEL_8B: return "8B";
  4658. case MODEL_9B: return "9B";
  4659. case MODEL_11B: return "11B";
  4660. case MODEL_12B: return "12B";
  4661. case MODEL_13B: return "13B";
  4662. case MODEL_14B: return "14B";
  4663. case MODEL_15B: return "15B";
  4664. case MODEL_16B: return "16B";
  4665. case MODEL_20B: return "20B";
  4666. case MODEL_30B: return "30B";
  4667. case MODEL_34B: return "34B";
  4668. case MODEL_35B: return "35B";
  4669. case MODEL_40B: return "40B";
  4670. case MODEL_65B: return "65B";
  4671. case MODEL_70B: return "70B";
  4672. case MODEL_236B: return "236B";
  4673. case MODEL_314B: return "314B";
  4674. case MODEL_SMALL: return "0.1B";
  4675. case MODEL_MEDIUM: return "0.4B";
  4676. case MODEL_LARGE: return "0.8B";
  4677. case MODEL_XL: return "1.5B";
  4678. case MODEL_A2_7B: return "A2.7B";
  4679. case MODEL_8x7B: return "8x7B";
  4680. case MODEL_8x22B: return "8x22B";
  4681. case MODEL_16x12B: return "16x12B";
  4682. case MODEL_10B_128x3_66B: return "10B+128x3.66B";
  4683. case MODEL_57B_A14B: return "57B.A14B";
  4684. case MODEL_27B: return "27B";
  4685. default: return "?B";
  4686. }
  4687. }
4688. static const char * llama_model_vocab_type_name(enum llama_vocab_type type) {
  4689. switch (type) {
  4690. case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
  4691. case LLAMA_VOCAB_TYPE_SPM: return "SPM";
  4692. case LLAMA_VOCAB_TYPE_BPE: return "BPE";
  4693. case LLAMA_VOCAB_TYPE_WPM: return "WPM";
  4694. case LLAMA_VOCAB_TYPE_UGM: return "UGM";
  4695. case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
  4696. default: return "unknown";
  4697. }
  4698. }
  4699. static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
  4700. model.arch = ml.get_arch();
  4701. if (model.arch == LLM_ARCH_UNKNOWN) {
  4702. throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
  4703. }
  4704. }
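// NOTE (descriptive): llm_load_arch(), llm_load_hparams(), llm_load_vocab() and
// llm_load_print_meta() below are the per-stage helpers of the model loading path.
// The architecture is presumably resolved first so that the architecture-specific
// key names (LLM_KV) and the hparams/vocab defaults can be selected afterwards.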
  4705. static void llm_load_hparams(
  4706. llama_model_loader & ml,
  4707. llama_model & model) {
  4708. auto & hparams = model.hparams;
  4709. const gguf_context * ctx = ml.meta;
  4710. // get metadata as string
  4711. for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
  4712. enum gguf_type type = gguf_get_kv_type(ctx, i);
  4713. if (type == GGUF_TYPE_ARRAY) {
  4714. continue;
  4715. }
  4716. const char * name = gguf_get_key(ctx, i);
  4717. const std::string value = gguf_kv_to_str(ctx, i);
  4718. model.gguf_kv.emplace(name, value);
  4719. }
  4720. // get general kv
  4721. ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
  4722. // get hparams kv
  4723. ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
  4724. // everything past this point is not vocab-related
  4725. if (hparams.vocab_only) {
  4726. return;
  4727. }
  4728. ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  4729. ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  4730. ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
  4731. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
  4732. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
  4733. GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
  4734. GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
  4735. if (hparams.n_expert > 0) {
  4736. GGML_ASSERT(hparams.n_expert_used > 0);
  4737. } else {
  4738. GGML_ASSERT(hparams.n_expert_used == 0);
  4739. }
  4740. // zero-out the per-layer hparams
  4741. std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
  4742. std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
  4743. std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
  4744. ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
  4745. ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
  4746. // n_head_kv is optional, default to n_head
  4747. hparams.n_head_kv_arr = hparams.n_head_arr;
  4748. ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
  4749. bool rope_finetuned = false;
  4750. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  4751. hparams.rope_finetuned = rope_finetuned;
  4752. hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
  4753. ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
  4754. // rope_freq_base (optional)
  4755. hparams.rope_freq_base_train = 10000.0f;
  4756. ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
  4757. std::string rope_scaling("linear");
  4758. ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
  4759. hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
  4760. GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
  4761. // rope_freq_scale (inverse of the kv) is optional
  4762. float ropescale = 0.0f;
  4763. if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
  4764. // try the old key name
  4765. ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
  4766. }
  4767. hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
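// Example (illustrative): a GGUF scaling factor of 4.0 stored under LLM_KV_ROPE_SCALING_FACTOR
// yields rope_freq_scale_train = 1/4 = 0.25, i.e. the frequency scale is the reciprocal of the
// stored factor; a missing or zero factor falls back to 1.0 (no scaling).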
  4768. ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
  4769. // non-transformer models do not have attention heads
  4770. if (hparams.n_head() > 0) {
  4771. // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
  4772. // gpt-j n_rot = rotary_dim
  4773. hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
  4774. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  4775. hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
  4776. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  4777. // sanity check for n_rot (optional)
  4778. hparams.n_rot = hparams.n_embd_head_k;
  4779. ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
  4780. if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
  4781. if (hparams.n_rot != hparams.n_embd_head_k) {
  4782. throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
  4783. }
  4784. }
  4785. } else {
  4786. hparams.n_rot = 0;
  4787. hparams.n_embd_head_k = 0;
  4788. hparams.n_embd_head_v = 0;
  4789. }
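// Example (illustrative): for a typical LLaMA-style model with n_embd = 4096 and n_head = 32,
// the defaults above give n_embd_head_k = n_embd_head_v = n_rot = 4096/32 = 128 unless the
// GGUF metadata overrides them; models without attention heads get all three set to 0.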
  4790. // arch-specific KVs
  4791. switch (model.arch) {
  4792. case LLM_ARCH_LLAMA:
  4793. {
  4794. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  4795. if (hparams.n_expert == 8) {
  4796. switch (hparams.n_layer) {
  4797. case 32: model.type = e_model::MODEL_8x7B; break;
  4798. case 56: model.type = e_model::MODEL_8x22B; break;
  4799. default: model.type = e_model::MODEL_UNKNOWN;
  4800. }
  4801. } else {
  4802. switch (hparams.n_layer) {
  4803. case 22: model.type = e_model::MODEL_1B; break;
  4804. case 26: model.type = e_model::MODEL_3B; break;
  4805. // granite uses a vocab with len 49152
  4806. case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
  4807. case 36: model.type = e_model::MODEL_8B; break; // granite
  4808. case 40: model.type = e_model::MODEL_13B; break;
  4809. case 48: model.type = e_model::MODEL_34B; break;
  4810. case 60: model.type = e_model::MODEL_30B; break;
  4811. case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
  4812. default: model.type = e_model::MODEL_UNKNOWN;
  4813. }
  4814. }
  4815. } break;
  4816. case LLM_ARCH_MINICPM:
  4817. {
  4818. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  4819. switch (hparams.n_layer) {
  4820. case 40: model.type = e_model::MODEL_2B; break;
  4821. default: model.type = e_model::MODEL_UNKNOWN;
  4822. }
  4823. } break;
  4824. case LLM_ARCH_GROK:
  4825. {
  4826. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  4827. switch (hparams.n_layer) {
  4828. case 64: model.type = e_model::MODEL_314B; break;
  4829. default: model.type = e_model::MODEL_UNKNOWN;
  4830. }
  4831. } break;
  4832. case LLM_ARCH_FALCON:
  4833. {
  4834. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  4835. switch (hparams.n_layer) {
  4836. case 32: model.type = e_model::MODEL_7B; break;
  4837. case 60: model.type = e_model::MODEL_40B; break;
  4838. default: model.type = e_model::MODEL_UNKNOWN;
  4839. }
  4840. } break;
  4841. case LLM_ARCH_BAICHUAN:
  4842. {
  4843. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  4844. switch (hparams.n_layer) {
  4845. case 32: model.type = e_model::MODEL_7B; break;
  4846. case 40: model.type = e_model::MODEL_13B; break;
  4847. default: model.type = e_model::MODEL_UNKNOWN;
  4848. }
  4849. if (model.type == e_model::MODEL_13B) {
  4850. // TODO: become GGUF KV parameter
  4851. hparams.f_max_alibi_bias = 8.0f;
  4852. }
  4853. } break;
  4854. case LLM_ARCH_STARCODER:
  4855. {
  4856. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  4857. switch (hparams.n_layer) {
  4858. case 24: model.type = e_model::MODEL_1B; break;
  4859. case 36: model.type = e_model::MODEL_3B; break;
  4860. case 42: model.type = e_model::MODEL_7B; break;
  4861. case 40: model.type = e_model::MODEL_15B; break;
  4862. default: model.type = e_model::MODEL_UNKNOWN;
  4863. }
  4864. } break;
  4865. case LLM_ARCH_REFACT:
  4866. {
  4867. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  4868. switch (hparams.n_layer) {
  4869. case 32: model.type = e_model::MODEL_1B; break;
  4870. default: model.type = e_model::MODEL_UNKNOWN;
  4871. }
  4872. // TODO: become GGUF KV parameter
  4873. hparams.f_max_alibi_bias = 8.0f;
  4874. } break;
  4875. case LLM_ARCH_BERT:
  4876. {
  4877. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  4878. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  4879. ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
  4880. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  4881. switch (hparams.n_layer) {
  4882. case 3:
  4883. model.type = e_model::MODEL_17M; break; // bge-micro
  4884. case 6:
  4885. model.type = e_model::MODEL_22M; break; // MiniLM-L6
  4886. case 12:
  4887. switch (hparams.n_embd) {
  4888. case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
  4889. case 768: model.type = e_model::MODEL_109M; break; // bge-base
  4890. } break;
  4891. case 24:
  4892. model.type = e_model::MODEL_335M; break; // bge-large
  4893. }
  4894. } break;
  4895. case LLM_ARCH_JINA_BERT_V2:
  4896. {
  4897. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  4898. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  4899. ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
  4900. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  4901. hparams.f_max_alibi_bias = 8.0f;
  4902. switch (hparams.n_layer) {
  4903. case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
  4904. case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
  4905. }
  4906. } break;
  4907. case LLM_ARCH_NOMIC_BERT:
  4908. {
  4909. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  4910. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  4911. ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
  4912. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  4913. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  4914. model.type = e_model::MODEL_137M;
  4915. }
  4916. } break;
  4917. case LLM_ARCH_BLOOM:
  4918. {
  4919. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  4920. switch (hparams.n_layer) {
  4921. case 24: model.type = e_model::MODEL_1B; break;
  4922. case 30:
  4923. switch (hparams.n_embd) {
  4924. case 2560: model.type = e_model::MODEL_3B; break;
  4925. case 4096: model.type = e_model::MODEL_7B; break;
  4926. } break;
  4927. }
  4928. // TODO: become GGUF KV parameter
  4929. hparams.f_max_alibi_bias = 8.0f;
  4930. } break;
  4931. case LLM_ARCH_MPT:
  4932. {
  4933. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  4934. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  4935. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  4936. switch (hparams.n_layer) {
  4937. case 32: model.type = e_model::MODEL_7B; break;
  4938. case 48: model.type = e_model::MODEL_30B; break;
  4939. default: model.type = e_model::MODEL_UNKNOWN;
  4940. }
  4941. } break;
  4942. case LLM_ARCH_STABLELM:
  4943. {
  4944. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  4945. switch (hparams.n_layer) {
  4946. case 24: model.type = e_model::MODEL_1B; break;
  4947. case 32: model.type = e_model::MODEL_3B; break;
  4948. case 40: model.type = e_model::MODEL_12B; break;
  4949. default: model.type = e_model::MODEL_UNKNOWN;
  4950. }
  4951. } break;
  4952. case LLM_ARCH_QWEN:
  4953. {
  4954. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  4955. switch (hparams.n_layer) {
  4956. case 32: model.type = e_model::MODEL_7B; break;
  4957. case 40: model.type = e_model::MODEL_13B; break;
  4958. default: model.type = e_model::MODEL_UNKNOWN;
  4959. }
  4960. } break;
  4961. case LLM_ARCH_QWEN2:
  4962. {
  4963. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  4964. switch (hparams.n_layer) {
  4965. case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
  4966. case 32: model.type = e_model::MODEL_7B; break;
  4967. case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
  4968. case 80: model.type = e_model::MODEL_70B; break;
  4969. default: model.type = e_model::MODEL_UNKNOWN;
  4970. }
  4971. } break;
  4972. case LLM_ARCH_QWEN2MOE:
  4973. {
  4974. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
  4975. ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  4976. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  4977. switch (hparams.n_layer) {
  4978. case 24: model.type = e_model::MODEL_A2_7B; break;
  4979. case 28: model.type = e_model::MODEL_57B_A14B; break;
  4980. default: model.type = e_model::MODEL_UNKNOWN;
  4981. }
  4982. } break;
  4983. case LLM_ARCH_PHI2:
  4984. {
  4985. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  4986. switch (hparams.n_layer) {
  4987. case 24: model.type = e_model::MODEL_1B; break;
  4988. case 32: model.type = e_model::MODEL_3B; break;
  4989. default: model.type = e_model::MODEL_UNKNOWN;
  4990. }
  4991. } break;
  4992. case LLM_ARCH_PHI3:
  4993. {
  4994. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  4995. switch (hparams.n_layer) {
  4996. case 24: model.type = e_model::MODEL_1B; break;
  4997. case 32: model.type = e_model::MODEL_3B; break;
  4998. case 40: model.type = e_model::MODEL_14B; break;
  4999. default: model.type = e_model::MODEL_UNKNOWN;
  5000. }
5001. // for backward compatibility; see: https://github.com/ggerganov/llama.cpp/pull/8931
  5002. if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
  5003. // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
  5004. hparams.n_swa = 2047;
  5005. } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
  5006. // default value for Phi-3-mini-128k-instruct
  5007. hparams.n_swa = 262144;
  5008. } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
  5009. // default value for Phi-3-medium-128k-instruct
  5010. hparams.n_swa = 131072;
  5011. }
  5012. bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  5013. if (!found_swa && hparams.n_swa == 0) {
  5014. throw std::runtime_error("invalid value for sliding_window");
  5015. }
  5016. } break;
  5017. case LLM_ARCH_PLAMO:
  5018. {
  5019. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5020. switch (hparams.n_layer) {
  5021. case 40: model.type = e_model::MODEL_13B; break;
  5022. default: model.type = e_model::MODEL_UNKNOWN;
  5023. }
  5024. } break;
  5025. case LLM_ARCH_GPT2:
  5026. {
  5027. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5028. switch (hparams.n_layer) {
  5029. case 12: model.type = e_model::MODEL_SMALL; break;
  5030. case 24: model.type = e_model::MODEL_MEDIUM; break;
  5031. case 36: model.type = e_model::MODEL_LARGE; break;
  5032. case 48: model.type = e_model::MODEL_XL; break;
  5033. default: model.type = e_model::MODEL_UNKNOWN;
  5034. }
  5035. } break;
  5036. case LLM_ARCH_CODESHELL:
  5037. {
  5038. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5039. switch (hparams.n_layer) {
  5040. case 42: model.type = e_model::MODEL_7B; break;
  5041. default: model.type = e_model::MODEL_UNKNOWN;
  5042. }
  5043. } break;
  5044. case LLM_ARCH_ORION:
  5045. {
  5046. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5047. switch (hparams.n_layer) {
  5048. case 40: model.type = e_model::MODEL_14B; break;
  5049. default: model.type = e_model::MODEL_UNKNOWN;
  5050. }
  5051. } break;
  5052. case LLM_ARCH_INTERNLM2:
  5053. {
  5054. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5055. switch (hparams.n_layer) {
  5056. case 32: model.type = e_model::MODEL_7B; break;
  5057. case 48: model.type = e_model::MODEL_20B; break;
  5058. default: model.type = e_model::MODEL_UNKNOWN;
  5059. }
  5060. } break;
  5061. case LLM_ARCH_GEMMA:
  5062. {
  5063. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5064. switch (hparams.n_layer) {
  5065. case 18: model.type = e_model::MODEL_2B; break;
  5066. case 28: model.type = e_model::MODEL_7B; break;
  5067. default: model.type = e_model::MODEL_UNKNOWN;
  5068. }
  5069. } break;
  5070. case LLM_ARCH_GEMMA2:
  5071. {
  5072. hparams.n_swa = 4096; // default value of gemma 2
  5073. ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  5074. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5075. ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
  5076. ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
  5077. hparams.attn_soft_cap = true;
  5078. switch (hparams.n_layer) {
  5079. case 26: model.type = e_model::MODEL_2B; break;
  5080. case 42: model.type = e_model::MODEL_9B; break;
  5081. case 46: model.type = e_model::MODEL_27B; break;
  5082. default: model.type = e_model::MODEL_UNKNOWN;
  5083. }
  5084. } break;
  5085. case LLM_ARCH_STARCODER2:
  5086. {
  5087. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5088. switch (hparams.n_layer) {
  5089. case 30: model.type = e_model::MODEL_3B; break;
  5090. case 32: model.type = e_model::MODEL_7B; break;
  5091. case 40: model.type = e_model::MODEL_15B; break;
  5092. case 52: model.type = e_model::MODEL_20B; break; // granite
  5093. case 88: model.type = e_model::MODEL_34B; break; // granite
  5094. default: model.type = e_model::MODEL_UNKNOWN;
  5095. }
  5096. } break;
  5097. case LLM_ARCH_MAMBA:
  5098. {
  5099. ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
  5100. ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
  5101. ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
  5102. ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
  5103. ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
  5104. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5105. switch (hparams.n_layer) {
  5106. case 24:
  5107. switch (hparams.n_embd) {
  5108. case 768: model.type = e_model::MODEL_SMALL; break;
  5109. default: model.type = e_model::MODEL_UNKNOWN;
  5110. } break;
  5111. case 48:
  5112. switch (hparams.n_embd) {
  5113. case 1024: model.type = e_model::MODEL_MEDIUM; break;
  5114. case 1536: model.type = e_model::MODEL_LARGE; break;
  5115. case 2048: model.type = e_model::MODEL_XL; break;
  5116. default: model.type = e_model::MODEL_UNKNOWN;
  5117. } break;
  5118. case 64:
  5119. switch (hparams.n_embd) {
  5120. case 2560: model.type = e_model::MODEL_3B; break;
  5121. default: model.type = e_model::MODEL_UNKNOWN;
  5122. } break;
  5123. default: model.type = e_model::MODEL_UNKNOWN;
  5124. }
  5125. } break;
  5126. case LLM_ARCH_XVERSE:
  5127. {
  5128. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5129. switch (hparams.n_layer) {
  5130. case 32: model.type = e_model::MODEL_7B; break;
  5131. case 40: model.type = e_model::MODEL_13B; break;
  5132. case 80: model.type = e_model::MODEL_65B; break;
  5133. default: model.type = e_model::MODEL_UNKNOWN;
  5134. }
  5135. } break;
  5136. case LLM_ARCH_COMMAND_R:
  5137. {
  5138. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  5139. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5140. switch (hparams.n_layer) {
  5141. case 40: model.type = e_model::MODEL_35B; break;
  5142. default: model.type = e_model::MODEL_UNKNOWN;
  5143. }
  5144. } break;
  5145. case LLM_ARCH_DBRX:
  5146. {
  5147. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5148. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
  5149. switch (hparams.n_layer) {
  5150. case 40: model.type = e_model::MODEL_16x12B; break;
  5151. default: model.type = e_model::MODEL_UNKNOWN;
  5152. }
  5153. } break;
  5154. case LLM_ARCH_OLMO:
  5155. {
  5156. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5157. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  5158. switch (hparams.n_layer) {
  5159. case 22: model.type = e_model::MODEL_1B; break;
  5160. case 32: model.type = e_model::MODEL_7B; break;
  5161. case 80: model.type = e_model::MODEL_70B; break;
  5162. default: model.type = e_model::MODEL_UNKNOWN;
  5163. }
  5164. } break;
  5165. case LLM_ARCH_OPENELM:
  5166. {
  5167. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5168. switch (hparams.n_layer) {
  5169. case 16: model.type = e_model::MODEL_270M; break;
  5170. case 20: model.type = e_model::MODEL_450M; break;
  5171. case 28: model.type = e_model::MODEL_1B; break;
  5172. case 36: model.type = e_model::MODEL_3B; break;
  5173. default: model.type = e_model::MODEL_UNKNOWN;
  5174. }
  5175. } break;
  5176. case LLM_ARCH_GPTNEOX:
  5177. {
  5178. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5179. ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
  5180. switch (hparams.n_layer) {
  5181. case 6:
  5182. switch (hparams.n_ff()) {
  5183. case 512: model.type = e_model::MODEL_14M; break;
  5184. case 2048: model.type = e_model::MODEL_70M; break;
  5185. default: model.type = e_model::MODEL_UNKNOWN;
  5186. } break;
  5187. case 12:
  5188. switch (hparams.n_ff()) {
  5189. case 3072: model.type = e_model::MODEL_160M; break;
  5190. default: model.type = e_model::MODEL_UNKNOWN;
  5191. } break;
  5192. case 16:
  5193. switch (hparams.n_ff()) {
  5194. case 8192: model.type = e_model::MODEL_1B; break;
  5195. default: model.type = e_model::MODEL_UNKNOWN;
  5196. } break;
  5197. case 24:
  5198. switch (hparams.n_ff()) {
  5199. case 4096: model.type = e_model::MODEL_410M; break;
  5200. case 8192: model.type = e_model::MODEL_1_4B; break;
  5201. default: model.type = e_model::MODEL_UNKNOWN;
  5202. } break;
  5203. case 32:
  5204. switch (hparams.n_ff()) {
  5205. case 10240: model.type = e_model::MODEL_2_8B; break;
  5206. case 16384: model.type = e_model::MODEL_6_9B; break;
  5207. default: model.type = e_model::MODEL_UNKNOWN;
  5208. } break;
  5209. case 36:
  5210. switch (hparams.n_ff()) {
  5211. case 20480: model.type = e_model::MODEL_12B; break;
  5212. default: model.type = e_model::MODEL_UNKNOWN;
  5213. } break;
  5214. case 44:
  5215. switch (hparams.n_ff()) {
  5216. case 24576: model.type = e_model::MODEL_20B; break;
  5217. default: model.type = e_model::MODEL_UNKNOWN;
  5218. } break;
  5219. default: model.type = e_model::MODEL_UNKNOWN;
  5220. }
  5221. } break;
  5222. case LLM_ARCH_ARCTIC:
  5223. {
  5224. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5225. if (hparams.n_expert == 128) {
  5226. switch (hparams.n_layer) {
  5227. case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
  5228. default: model.type = e_model::MODEL_UNKNOWN;
  5229. }
  5230. } else {
  5231. model.type = e_model::MODEL_UNKNOWN;
  5232. }
  5233. } break;
  5234. case LLM_ARCH_DEEPSEEK2:
  5235. {
  5236. bool is_lite = (hparams.n_layer == 27);
  5237. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5238. ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
  5239. if (!is_lite) {
  5240. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  5241. }
  5242. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  5243. ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  5244. ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  5245. ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
  5246. ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
  5247. switch (hparams.n_layer) {
  5248. case 27: model.type = e_model::MODEL_16B; break;
  5249. case 60: model.type = e_model::MODEL_236B; break;
  5250. default: model.type = e_model::MODEL_UNKNOWN;
  5251. }
  5252. } break;
  5253. case LLM_ARCH_CHATGLM:
  5254. {
  5255. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5256. switch (hparams.n_layer) {
  5257. case 28: model.type = e_model::MODEL_6B; break;
  5258. case 40: model.type = e_model::MODEL_9B; break;
  5259. default: model.type = e_model::MODEL_UNKNOWN;
  5260. }
  5261. } break;
  5262. case LLM_ARCH_BITNET:
  5263. {
  5264. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5265. switch (hparams.n_layer) {
  5266. case 26: model.type = e_model::MODEL_3B; break;
  5267. default: model.type = e_model::MODEL_UNKNOWN;
  5268. }
  5269. } break;
  5270. case LLM_ARCH_T5:
  5271. {
  5272. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5273. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  5274. uint32_t dec_start_token_id;
  5275. if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
  5276. hparams.dec_start_token_id = dec_start_token_id;
  5277. }
  5278. switch (hparams.n_layer) {
  5279. case 6: model.type = e_model::MODEL_60M; break; // t5-small
  5280. case 8: model.type = e_model::MODEL_80M; break; // flan-t5-small
  5281. case 12:
  5282. switch (hparams.n_ff()) {
  5283. case 3072: model.type = e_model::MODEL_220M; break; // t5-base
  5284. case 2048: model.type = e_model::MODEL_250M; break; // flan-t5-base
  5285. default: model.type = e_model::MODEL_UNKNOWN;
  5286. } break;
  5287. case 24:
  5288. switch (hparams.n_ff()) {
  5289. case 4096: model.type = e_model::MODEL_770M; break; // t5-large
  5290. case 2816: model.type = e_model::MODEL_780M; break; // flan-t5-large
  5291. case 16384: model.type = e_model::MODEL_3B; break; // t5-3b
  5292. case 5120: model.type = e_model::MODEL_3B; break; // flan-t5-xl
  5293. case 65536: model.type = e_model::MODEL_11B; break; // t5-11b
  5294. case 10240: model.type = e_model::MODEL_11B; break; // flan-t5-xxl
  5295. default: model.type = e_model::MODEL_UNKNOWN;
  5296. } break;
  5297. default: model.type = e_model::MODEL_UNKNOWN;
  5298. }
  5299. } break;
  5300. case LLM_ARCH_T5ENCODER:
  5301. {
  5302. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5303. ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
  5304. model.type = e_model::MODEL_UNKNOWN;
  5305. } break;
  5306. case LLM_ARCH_JAIS:
  5307. {
  5308. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5309. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  5310. switch (hparams.n_layer) {
  5311. case 24: model.type = e_model::MODEL_1_3B; break;
  5312. case 40: model.type = e_model::MODEL_13B; break;
  5313. /* TODO: add variants */
  5314. default: model.type = e_model::MODEL_UNKNOWN;
  5315. }
  5316. } break;
  5317. case LLM_ARCH_NEMOTRON:
  5318. {
  5319. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5320. switch (hparams.n_layer) {
  5321. case 32: model.type = e_model::MODEL_4B; break;
  5322. default: model.type = e_model::MODEL_UNKNOWN;
  5323. }
  5324. } break;
  5325. case LLM_ARCH_EXAONE:
  5326. {
  5327. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5328. switch (hparams.n_layer) {
  5329. case 32: model.type = e_model::MODEL_8B; break;
  5330. default: model.type = e_model::MODEL_UNKNOWN;
  5331. }
  5332. } break;
  5333. case LLM_ARCH_RWKV6:
  5334. {
  5335. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  5336. ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
  5337. ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
  5338. ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
  5339. ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
  5340. switch (hparams.n_layer) {
  5341. case 24: model.type = e_model::MODEL_1_6B; break;
  5342. case 32:
  5343. switch (hparams.n_embd) {
  5344. case 2560: model.type = e_model::MODEL_3B; break;
  5345. case 4096: model.type = e_model::MODEL_7B; break;
  5346. default: model.type = e_model::MODEL_UNKNOWN;
  5347. } break;
  5348. case 61: model.type = e_model::MODEL_14B; break;
  5349. default: model.type = e_model::MODEL_UNKNOWN;
  5350. }
  5351. } break;
  5352. case LLM_ARCH_SOLAR:
  5353. {
  5354. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  5355. for (int i = 0; i < hparams.n_bskcn_arr.max_size(); ++i) {
  5356. auto & bskcn = hparams.n_bskcn_arr.at(i);
  5357. bskcn.fill(0);
  5358. ml.get_key_or_arr(::format(LLM_KV_NAMES.at(LLM_KV_ATTENTION_BLOCK_SKIP_CONNECTION), LLM_ARCH_NAMES.at(ml.llm_kv.arch), i), bskcn, hparams.n_layer, false);
  5359. }
  5360. switch (hparams.n_layer) {
  5361. case 64: model.type = e_model::MODEL_22B; break;
  5362. default: model.type = e_model::MODEL_UNKNOWN;
  5363. }
5364. } break;
  5365. default: (void)0;
  5366. }
  5367. model.ftype = ml.ftype;
  5368. if (hparams.f_max_alibi_bias > 0.0f) {
  5369. hparams.use_alibi = true;
  5370. }
  5371. hparams.rope_type = llama_rope_type(&model);
  5372. }
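// NOTE (descriptive): at this point hparams holds the per-layer arrays (n_head_arr,
// n_head_kv_arr, n_ff_arr) that back the n_head(il)/n_head_kv(il)/n_ff(il) accessors used
// in the switch above, plus the architecture-specific model.type guess; the vocabulary and
// tensor data are presumably filled in by the later loading stages.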
  5373. static void llm_load_vocab(
  5374. llama_model_loader & ml,
  5375. llama_model & model) {
  5376. auto & vocab = model.vocab;
  5377. struct gguf_context * ctx = ml.meta;
  5378. const auto kv = LLM_KV(model.arch);
  5379. // determine vocab type
  5380. {
  5381. std::string tokenizer_model;
  5382. std::string tokenizer_pre;
  5383. ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
  5384. ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
  5385. if (tokenizer_model == "no_vocab") {
  5386. vocab.type = LLAMA_VOCAB_TYPE_NONE;
  5387. // default special tokens
  5388. vocab.special_bos_id = -1;
  5389. vocab.special_eos_id = -1;
  5390. vocab.special_unk_id = -1;
  5391. vocab.special_sep_id = -1;
  5392. vocab.special_pad_id = -1;
  5393. vocab.special_cls_id = -1;
  5394. vocab.special_mask_id = -1;
  5395. vocab.linefeed_id = -1;
  5396. return;
  5397. } else if (tokenizer_model == "llama") {
  5398. vocab.type = LLAMA_VOCAB_TYPE_SPM;
  5399. // default special tokens
  5400. vocab.special_bos_id = 1;
  5401. vocab.special_eos_id = 2;
  5402. vocab.special_unk_id = 0;
  5403. vocab.special_sep_id = -1;
  5404. vocab.special_pad_id = -1;
  5405. vocab.special_cls_id = -1;
  5406. vocab.special_mask_id = -1;
  5407. } else if (tokenizer_model == "bert") {
  5408. vocab.type = LLAMA_VOCAB_TYPE_WPM;
  5409. // default special tokens
  5410. vocab.special_bos_id = -1;
  5411. vocab.special_eos_id = -1;
  5412. vocab.special_unk_id = 100;
  5413. vocab.special_sep_id = 102;
  5414. vocab.special_pad_id = 0;
  5415. vocab.special_cls_id = 101;
  5416. vocab.special_mask_id = 103;
  5417. } else if (tokenizer_model == "gpt2") {
  5418. vocab.type = LLAMA_VOCAB_TYPE_BPE;
  5419. // read bpe merges and populate bpe ranks
  5420. const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
  5421. if (merges_keyidx == -1) {
  5422. throw std::runtime_error("cannot find tokenizer merges in model file\n");
  5423. }
  5424. const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
  5425. for (int i = 0; i < n_merges; i++) {
  5426. const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
  5427. GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
  5428. std::string first;
  5429. std::string second;
  5430. const size_t pos = word.find(' ', 1);
  5431. if (pos != std::string::npos) {
  5432. first = word.substr(0, pos);
  5433. second = word.substr(pos + 1);
  5434. }
  5435. vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
  5436. }
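// NOTE (descriptive): bpe_ranks maps each merge pair (first, second) to its position in the
// merges list, so a lower rank means the merge was learned earlier and is applied with higher
// priority during BPE tokenization.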
  5437. // default special tokens
  5438. vocab.special_bos_id = 11;
  5439. vocab.special_eos_id = 11;
  5440. vocab.special_unk_id = -1;
  5441. vocab.special_sep_id = -1;
  5442. vocab.special_pad_id = -1;
  5443. vocab.special_cls_id = -1;
  5444. vocab.special_mask_id = -1;
  5445. } else if (tokenizer_model == "t5") {
  5446. vocab.type = LLAMA_VOCAB_TYPE_UGM;
  5447. // default special tokens
  5448. vocab.special_bos_id = -1;
  5449. vocab.special_eos_id = 1;
  5450. vocab.special_unk_id = 2;
  5451. vocab.special_sep_id = -1;
  5452. vocab.special_pad_id = 0;
  5453. vocab.special_cls_id = -1;
  5454. vocab.special_mask_id = -1;
  5455. const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
  5456. if (precompiled_charsmap_keyidx != -1) {
  5457. size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
  5458. const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
  5459. vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
  5460. #ifdef IS_BIG_ENDIAN
5461. // correct endianness of data in precompiled_charsmap binary blob
  5462. uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
  5463. *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
  5464. assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
  5465. size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
  5466. uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
  5467. for (size_t i = 0; i < xcda_array_size; ++i) {
  5468. xcda_array[i] = __builtin_bswap32(xcda_array[i]);
  5469. }
  5470. #endif
  5471. }
  5472. } else if (tokenizer_model == "rwkv") {
  5473. vocab.type = LLAMA_VOCAB_TYPE_RWKV;
  5474. // default special tokens
  5475. vocab.special_bos_id = -1;
  5476. vocab.special_eos_id = -1;
  5477. vocab.special_unk_id = -1;
  5478. vocab.special_sep_id = -1;
  5479. vocab.special_pad_id = -1;
  5480. } else {
  5481. throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
  5482. }
  5483. // for now, only BPE models have pre-tokenizers
  5484. if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
  5485. vocab.tokenizer_add_space_prefix = false;
  5486. vocab.tokenizer_clean_spaces = true;
  5487. if (tokenizer_pre == "default") {
  5488. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  5489. } else if (
  5490. tokenizer_pre == "llama3" ||
  5491. tokenizer_pre == "llama-v3" ||
  5492. tokenizer_pre == "llama-bpe") {
  5493. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
  5494. vocab.tokenizer_ignore_merges = true;
  5495. vocab.tokenizer_add_bos = true;
  5496. } else if (
  5497. tokenizer_pre == "deepseek-llm") {
  5498. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
  5499. vocab.tokenizer_clean_spaces = false;
  5500. } else if (
  5501. tokenizer_pre == "deepseek-coder") {
  5502. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
  5503. vocab.tokenizer_clean_spaces = false;
  5504. } else if (
  5505. tokenizer_pre == "falcon") {
  5506. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
  5507. } else if (
  5508. tokenizer_pre == "mpt") {
  5509. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
  5510. } else if (
  5511. tokenizer_pre == "starcoder") {
  5512. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
  5513. } else if (
  5514. tokenizer_pre == "gpt-2" ||
  5515. tokenizer_pre == "phi-2" ||
  5516. tokenizer_pre == "jina-es" ||
  5517. tokenizer_pre == "jina-de" ||
  5518. tokenizer_pre == "jina-v2-es" ||
  5519. tokenizer_pre == "jina-v2-de" ||
  5520. tokenizer_pre == "jina-v2-code") {
  5521. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
  5522. } else if (
  5523. tokenizer_pre == "refact") {
  5524. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
  5525. } else if (
  5526. tokenizer_pre == "command-r") {
  5527. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
  5528. vocab.tokenizer_clean_spaces = false;
  5529. } else if (
  5530. tokenizer_pre == "qwen2") {
  5531. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
  5532. vocab.tokenizer_clean_spaces = false;
  5533. } else if (
  5534. tokenizer_pre == "stablelm2") {
  5535. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
  5536. } else if (
  5537. tokenizer_pre == "olmo") {
  5538. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
  5539. } else if (
  5540. tokenizer_pre == "dbrx") {
  5541. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
  5542. } else if (
  5543. tokenizer_pre == "smaug-bpe") {
  5544. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
  5545. } else if (
  5546. tokenizer_pre == "poro-chat") {
  5547. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
  5548. vocab.tokenizer_clean_spaces = false;
  5549. } else if (
  5550. tokenizer_pre == "chatglm-bpe") {
  5551. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
  5552. vocab.special_bos_id = -1;
  5553. } else if (
  5554. tokenizer_pre == "viking") {
  5555. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
  5556. vocab.tokenizer_clean_spaces = false;
  5557. } else if (
  5558. tokenizer_pre == "jais") {
  5559. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
  5560. } else if (
  5561. tokenizer_pre == "tekken") {
  5562. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
  5563. vocab.tokenizer_clean_spaces = false;
  5564. vocab.tokenizer_ignore_merges = true;
  5565. vocab.tokenizer_add_bos = true;
  5566. } else if (
  5567. tokenizer_pre == "smollm") {
  5568. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
  5569. vocab.tokenizer_clean_spaces = false;
  5570. } else if (
  5571. tokenizer_pre == "codeshell") {
  5572. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
  5573. } else if (
  5574. tokenizer_pre == "bloom") {
  5575. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
  5576. } else if (
  5577. tokenizer_pre == "gpt3-finnish") {
  5578. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
  5579. } else if (
  5580. tokenizer_pre == "exaone") {
  5581. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
  5582. } else {
  5583. LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
  5584. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  5585. }
  5586. } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
  5587. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  5588. vocab.tokenizer_add_space_prefix = true;
  5589. vocab.tokenizer_clean_spaces = false;
  5590. vocab.tokenizer_add_bos = true;
  5591. vocab.tokenizer_add_eos = false;
  5592. } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
  5593. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  5594. vocab.tokenizer_add_space_prefix = false;
  5595. vocab.tokenizer_clean_spaces = true;
  5596. vocab.tokenizer_add_bos = true;
  5597. vocab.tokenizer_add_eos = false;
  5598. } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
  5599. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  5600. vocab.tokenizer_add_bos = false;
  5601. vocab.tokenizer_add_eos = true;
  5602. } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
  5603. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  5604. vocab.tokenizer_add_space_prefix = false;
  5605. vocab.tokenizer_clean_spaces = false;
  5606. vocab.tokenizer_add_bos = false;
  5607. vocab.tokenizer_add_eos = false;
  5608. } else {
  5609. vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
  5610. }
  5611. ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
  5612. ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
  5613. }
  5614. const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
  5615. if (token_idx == -1) {
  5616. throw std::runtime_error("cannot find tokenizer vocab in model file\n");
  5617. }
  5618. const float * scores = nullptr;
  5619. const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
  5620. if (score_idx != -1) {
  5621. scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
  5622. }
  5623. const int * toktypes = nullptr;
  5624. const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
  5625. if (toktype_idx != -1) {
  5626. toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
  5627. }
  5628. const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
  5629. vocab.id_to_token.resize(n_vocab);
  5630. for (uint32_t i = 0; i < n_vocab; i++) {
  5631. std::string word = gguf_get_arr_str(ctx, token_idx, i);
  5632. GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
  5633. vocab.token_to_id[word] = i;
  5634. vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
  5635. auto & token_data = vocab.id_to_token[i];
  5636. token_data.text = std::move(word);
  5637. token_data.score = scores ? scores[i] : 0.0f;
  5638. token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
  5639. if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
  5640. switch(toktypes[i]) {
  5641. case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
  5642. case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
  5643. case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
  5644. case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
  5645. case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
  5646. case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
  5647. case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
  5648. default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
  5649. }
  5650. }
  5651. }
  5652. GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
  5653. // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
  5654. if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
5655. // For Fill-In-the-Middle (FIM)/infill models which were converted
  5656. // prior to support of FIM special tokens in GGUF, the following
  5657. // will allow those models to continue to work. The general names
  5658. // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
  5659. // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
  5660. // new versions of these models have been published.
  5661. std::string gen_name;
  5662. ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
  5663. std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
  5664. [](unsigned char c){ return std::tolower(c); });
  5665. if (gen_name.find("code") != std::string::npos) {
  5666. if (model.arch == LLM_ARCH_LLAMA
  5667. && 32010 < vocab.id_to_token.size()
  5668. && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
  5669. && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
  5670. && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
  5671. && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
  5672. vocab.special_prefix_id = 32007;
  5673. vocab.special_suffix_id = 32008;
  5674. vocab.special_middle_id = 32009;
  5675. vocab.special_eot_id = 32010;
  5676. } else if (model.arch == LLM_ARCH_GEMMA
  5677. && 107 < vocab.id_to_token.size()
  5678. && vocab.id_to_token[67].text == "<|fim_prefix|>"
  5679. && vocab.id_to_token[69].text == "<|fim_suffix|>"
  5680. && vocab.id_to_token[68].text == "<|fim_middle|>"
  5681. && vocab.id_to_token[107].text == "<end_of_turn>") {
  5682. vocab.special_prefix_id = 67;
  5683. vocab.special_suffix_id = 69;
  5684. vocab.special_middle_id = 68;
  5685. // TODO: this is not EOT, it is "file separator" token, needs fix
  5686. // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
  5687. //vocab.special_eot_id = 70;
  5688. vocab.special_eot_id = 107;
  5689. }
  5690. }
  5691. try {
  5692. vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
  5693. } catch (const std::exception & e) {
5694. LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, e.what());
  5695. vocab.linefeed_id = vocab.special_pad_id;
  5696. }
  5697. } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
  5698. vocab.linefeed_id = vocab.special_pad_id;
  5699. } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
  5700. const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
  5701. GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
  5702. vocab.linefeed_id = ids[0];
  5703. } else {
  5704. const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
  5705. GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
  5706. vocab.linefeed_id = ids[0];
  5707. }
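// NOTE (descriptive): "\xC4\x8A" is the UTF-8 encoding of U+010A ('Ċ'), the byte-level BPE
// representation of the newline byte in GPT-2-style vocabularies, which is why tokenizing it
// recovers the linefeed token for the remaining vocab types.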
  5708. // special tokens
  5709. {
  5710. const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
  5711. { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
  5712. { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
  5713. { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
  5714. { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
  5715. { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
  5716. { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
  5717. { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
  5718. { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
  5719. { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
  5720. { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
  5721. { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
  5722. { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
  5723. };
  5724. for (const auto & it : special_token_types) {
  5725. const std::string & key = kv(std::get<0>(it));
  5726. int32_t & id = std::get<1>(it);
  5727. uint32_t new_id;
  5728. if (!ml.get_key(std::get<0>(it), new_id, false)) {
  5729. continue;
  5730. }
  5731. if (new_id >= vocab.id_to_token.size()) {
5732. LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
  5733. __func__, key.c_str(), new_id, id);
  5734. } else {
  5735. id = new_id;
  5736. }
  5737. }
  5738. // Handle add_bos_token and add_eos_token
  5739. {
  5740. bool temp = true;
  5741. if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
  5742. vocab.tokenizer_add_bos = temp;
  5743. }
  5744. if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
  5745. vocab.tokenizer_add_eos = temp;
  5746. }
  5747. }
  5748. // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
  5749. //
  5750. // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
  5751. // for now, we apply this workaround to find the EOT token based on its text
  5752. if (vocab.special_eot_id == -1) {
  5753. for (const auto & t : vocab.token_to_id) {
  5754. if (
  5755. // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
  5756. // need to fix convert script
  5757. //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
  5758. (t.first == "<|eot_id|>" ||
  5759. t.first == "<|im_end|>" ||
  5760. t.first == "<|end|>" ||
  5761. t.first == "<end_of_turn>" ||
  5762. t.first == "<|endoftext|>"
  5763. )
  5764. ) {
  5765. vocab.special_eot_id = t.second;
  5766. break;
  5767. }
  5768. }
  5769. }
  5770. // find EOM token: "<|eom_id|>"
  5771. //
  5772. // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
  5773. // for now, we apply this workaround to find the EOM token based on its text
  5774. if (vocab.special_eom_id == -1) {
  5775. const auto & t = vocab.token_to_id.find("<|eom_id|>");
  5776. if (t != vocab.token_to_id.end()) {
  5777. vocab.special_eom_id = t->second;
  5778. }
  5779. }
  5780. }
  5781. // build special tokens cache
  5782. {
  5783. for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
  5784. if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
  5785. vocab.cache_special_tokens.push_back(id);
  5786. }
  5787. }
  5788. std::sort(vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
  5789. [&] (const llama_vocab::id a, const llama_vocab::id b) {
  5790. return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
  5791. }
  5792. );
  5793. LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
  5794. }
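// NOTE (descriptive): the special tokens cache is sorted by descending token text length,
// presumably so that when raw text is scanned for special tokens, longer candidates are tried
// before shorter ones that could be their prefixes.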
  5795. // build token to piece cache
  5796. {
  5797. size_t size_cache = 0;
  5798. std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
  5799. for (uint32_t id = 0; id < n_vocab; ++id) {
  5800. cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
  5801. size_cache += cache_token_to_piece[id].size();
  5802. }
  5803. std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
5804. LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MiB\n", __func__, size_cache / 1024.0 / 1024.0);
  5805. }
  5806. // Handle per token attributes
  5807. //NOTE: Each model customizes per token attributes.
  5808. //NOTE: Per token attributes are missing from the GGUF file.
  5809. //TODO: Extract attributes from GGUF file.
  5810. {
  5811. auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
5812. for (const auto & substr : substrs) {
5813. if (str.find(substr) != std::string::npos) {
  5814. return true;
  5815. }
  5816. }
  5817. return false;
  5818. };
  5819. auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
  5820. uint32_t current = vocab.id_to_token.at(id).attr;
  5821. current = value ? (current | attr) : (current & ~attr);
  5822. vocab.id_to_token[id].attr = (llama_token_attr) current;
  5823. };
  5824. auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
  5825. _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
  5826. };
  5827. std::string model_name;
  5828. std::string tokenizer_pre;
  5829. ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
  5830. ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
  5831. // model name to lowercase
  5832. std::transform(model_name.begin(), model_name.end(), model_name.begin(),
  5833. [] (const std::string::value_type x) {
  5834. return std::tolower(x);
  5835. }
  5836. );
  5837. // set attributes by model/tokenizer name
  5838. if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
  5839. _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
  5840. } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
  5841. for (auto id : vocab.cache_special_tokens) {
  5842. _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
  5843. }
  5844. for (auto token : {"</s>"}) {
  5845. _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
  5846. }
  5847. for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
  5848. _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
  5849. }
  5850. }
  5851. }
  5852. }
  5853. static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  5854. const auto & hparams = model.hparams;
  5855. const auto & vocab = model.vocab;
  5856. const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
  5857. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  5858. bool is_var = false;
  5859. std::vector<uint32_t> v;
  5860. for (uint32_t i = 0; i < n; ++i) {
  5861. v.push_back(f(i));
  5862. if (v[i] != v[0]) {
  5863. is_var = true;
  5864. }
  5865. }
  5866. std::stringstream ss;
  5867. if (is_var) {
  5868. ss << "[";
  5869. for (uint32_t i = 0; i < n; ++i) {
  5870. ss << v[i];
  5871. if (i < n - 1) {
  5872. ss << ", ";
  5873. }
  5874. }
  5875. ss << "]";
  5876. } else {
  5877. ss << v[0];
  5878. }
  5879. return ss.str();
  5880. };
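// NOTE (descriptive): print_f renders a per-layer hyperparameter either as a single value when
// it is constant across layers (e.g. "32") or as a bracketed list when it varies
// (e.g. "[32, 32, 8]"), which is how the n_head/n_head_kv/n_ff lines below can show per-layer
// differences.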
  5881. // hparams
  5882. LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
  5883. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
  5884. LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
  5885. LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
5886. LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) vocab.bpe_ranks.size());
  5887. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  5888. if (!hparams.vocab_only) {
  5889. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  5890. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  5891. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  5892. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  5893. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  5894. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  5895. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  5896. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  5897. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  5898. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  5899. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  5900. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  5901. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  5902. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  5903. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  5904. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  5905. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  5906. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  5907. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  5908. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  5909. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  5910. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  5911. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  5912. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
  5913. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  5914. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  5915. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  5916. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  5917. LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  5918. LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  5919. LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  5920. LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  5921. LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
  5922. }
  5923. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
  5924. LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
  5925. if (ml.n_elements >= 1e12) {
  5926. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
  5927. } else if (ml.n_elements >= 1e9) {
  5928. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
  5929. } else if (ml.n_elements >= 1e6) {
  5930. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
  5931. } else {
  5932. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
  5933. }
  5934. if (ml.n_bytes < GiB) {
  5935. LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  5936. } else {
  5937. LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  5938. }
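// note: BPW (bits per weight) is n_bytes*8 / n_elements, i.e. the average
// storage cost of one parameter including quantization metadata. Illustrative
// example (hypothetical numbers): 7.24 B parameters stored in 4.37 GiB give
// 4.37*1024^3*8 / 7.24e9 ~= 5.18 BPW.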
  5939. // general kv
  5940. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
  5941. // special tokens
  5942. if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
  5943. if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
  5944. if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
  5945. if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
  5946. if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
  5947. if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
  5948. if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
  5949. if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  5950. if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
  5951. if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
  5952. if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
  5953. if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
  5954. LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
  5955. if (model.arch == LLM_ARCH_DEEPSEEK2) {
  5956. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  5957. LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  5958. LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
  5959. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5960. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  5961. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  5962. LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  5963. }
  5964. if (model.arch == LLM_ARCH_QWEN2MOE) {
  5965. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  5966. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  5967. }
  5968. }
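// Illustrative output of llm_load_print_meta (values are hypothetical; the
// format follows the LLAMA_LOG_INFO calls above):
//   llm_load_print_meta: format = GGUF V3 (latest)
//   llm_load_print_meta: arch = llama
//   llm_load_print_meta: n_vocab = 32000
//   llm_load_print_meta: n_ctx_train = 4096
//   llm_load_print_meta: model params = 7.24 B
//   llm_load_print_meta: model size = 4.37 GiB (5.18 BPW)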
  5969. // Returns false if cancelled by progress_callback
  5970. static bool llm_load_tensors(
  5971. llama_model_loader & ml,
  5972. llama_model & model,
  5973. int n_gpu_layers,
  5974. enum llama_split_mode split_mode,
  5975. int main_gpu,
  5976. const float * tensor_split,
  5977. bool use_mlock,
  5978. llama_progress_callback progress_callback,
  5979. void * progress_callback_user_data) {
  5980. model.t_start_us = ggml_time_us();
  5981. auto & hparams = model.hparams;
  5982. model.split_mode = split_mode;
  5983. model.main_gpu = main_gpu;
  5984. model.n_gpu_layers = n_gpu_layers;
  5985. const int n_layer = hparams.n_layer;
  5986. const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
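// illustrative example (hypothetical values): with n_layer = 32 and
// n_gpu_layers = 20, i_gpu_start = 12, so layers 0..11 stay on the CPU and
// layers 12..31 are offloaded below.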
  5987. bool use_mmap_buffer = true;
  5988. // there is very little benefit to offloading the input layer, so always keep it on the CPU
  5989. model.buft_input = llama_default_buffer_type_cpu(true);
  5990. //model.buft_input = llama_default_buffer_type_offload(main_gpu);
  5991. model.buft_layer.resize(n_layer);
  5992. // assign cpu layers
  5993. for (int i = 0; i < i_gpu_start; ++i) {
  5994. model.buft_layer[i] = llama_default_buffer_type_cpu(true);
  5995. }
  5996. if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
  5997. // calculate the split points
  5998. int device_count = llama_get_device_count(model);
  5999. bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
  6000. std::vector<float> splits(device_count);
  6001. if (all_zero) {
  6002. // default split, by free memory
  6003. for (int i = 0; i < device_count; ++i) {
  6004. splits[i] = llama_get_device_memory(model, i);
  6005. }
  6006. } else {
  6007. std::copy(tensor_split, tensor_split + device_count, splits.begin());
  6008. }
  6009. // sum and normalize the splits to get the split points
  6010. float split_sum = 0.0f;
  6011. for (int i = 0; i < device_count; ++i) {
  6012. split_sum += splits[i];
  6013. splits[i] = split_sum;
  6014. }
  6015. for (int i = 0; i < device_count; ++i) {
  6016. splits[i] /= split_sum;
  6017. }
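// illustrative example (hypothetical values): tensor_split = {1, 1, 2} becomes
// the cumulative sums {1, 2, 4} and, after normalization, the split points
// {0.25, 0.50, 1.00}.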
  6018. // assign the repeating layers to the devices according to the splits
  6019. int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
  6020. for (int i = i_gpu_start; i < n_layer; ++i) {
  6021. int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
  6022. model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
  6023. }
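// note: std::upper_bound maps each offloaded layer's fractional position
// (i - i_gpu_start)/act_gpu_layers to the first device whose split point
// exceeds it, so device d receives positions in [splits[d-1], splits[d]);
// act_gpu_layers is clamped to n_layer + 1 so the extra slot accounts for the
// output layer handled just below.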
  6024. // assign the output layer
  6025. if (n_gpu_layers > n_layer) {
  6026. int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
  6027. model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
  6028. } else {
  6029. model.buft_output = llama_default_buffer_type_cpu(true);
  6030. }
  6031. } else {
  6032. ggml_backend_buffer_type_t split_buft;
  6033. if (split_mode == LLAMA_SPLIT_MODE_ROW) {
  6034. split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
  6035. } else {
  6036. // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
  6037. split_buft = llama_default_buffer_type_offload(model, main_gpu);
  6038. }
  6039. // assign the repeating layers
  6040. for (int i = i_gpu_start; i < n_layer; ++i) {
  6041. model.buft_layer[i] = {
  6042. split_buft,
  6043. llama_default_buffer_type_offload(model, main_gpu)
  6044. };
  6045. }
  6046. // assign the output layer
  6047. if (n_gpu_layers > n_layer) {
  6048. model.buft_output = {
  6049. split_buft,
  6050. llama_default_buffer_type_offload(model, main_gpu)
  6051. };
  6052. } else {
  6053. model.buft_output = llama_default_buffer_type_cpu(true);
  6054. }
  6055. }
  6056. // count used buffer types
  6057. std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
  6058. buft_layer_count[model.buft_input.buft]++;
  6059. buft_layer_count[model.buft_input.buft_matrix]++;
  6060. buft_layer_count[model.buft_output.buft]++;
  6061. buft_layer_count[model.buft_output.buft_matrix]++;
  6062. for (int i = 0; i < n_layer; ++i) {
  6063. buft_layer_count[model.buft_layer[i].buft]++;
  6064. buft_layer_count[model.buft_layer[i].buft_matrix]++;
  6065. }
  6066. // create one context per buffer type
  6067. size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output
6068. // extra overhead for the merged MoE expert tensors that may be created below (3 per layer)
  6069. ctx_size += ggml_tensor_overhead()*n_layer*3;
  6070. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  6071. for (auto & it : buft_layer_count) {
  6072. struct ggml_init_params params = {
  6073. /*.mem_size =*/ ctx_size,
  6074. /*.mem_buffer =*/ NULL,
  6075. /*.no_alloc =*/ true,
  6076. };
  6077. ggml_context * ctx = ggml_init(params);
  6078. if (!ctx) {
  6079. throw std::runtime_error(format("failed to create context"));
  6080. }
  6081. ctx_map[it.first] = ctx;
  6082. model.ctxs.push_back(ctx);
  6083. }
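// note: with no_alloc = true these contexts only hold tensor metadata
// (ggml_tensor structs); the actual weight data is placed later in mmap-backed
// or backend (e.g. GPU) buffers.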
  6084. LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
  6085. // create tensors for the weights
  6086. {
  6087. // note: cast to int64_t since we will use these for the tensor dimensions
  6088. const int64_t n_head = hparams.n_head();
  6089. const int64_t n_head_kv = hparams.n_head_kv();
  6090. const int64_t n_embd = hparams.n_embd;
  6091. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  6092. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  6093. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  6094. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  6095. const int64_t n_ff = hparams.n_ff();
  6096. const int64_t n_embd_gqa = n_embd_v_gqa;
  6097. const int64_t n_vocab = hparams.n_vocab;
  6098. const int64_t n_vocab_type = hparams.n_vocab_type;
  6099. const int64_t n_rot = hparams.n_rot;
  6100. const int64_t n_expert = hparams.n_expert;
  6101. const int64_t n_expert_used = hparams.n_expert_used;
  6102. const int64_t n_ctx_train = hparams.n_ctx_train;
  6103. if (n_expert > 0 && hparams.n_expert_used == 0) {
  6104. throw std::runtime_error("model has expert layers but no expert layers are used");
  6105. }
  6106. ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
  6107. ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
  6108. ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
  6109. auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
  6110. auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
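// note: every layer carries two buffer types: buft_matrix for the large matrix
// weights (which may be a row-split buffer spanning several GPUs) and buft for
// the remaining tensors; ctx_for_layer_split() returns the context bound to the
// former and ctx_for_layer() the one bound to the latter.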
  6111. model.layers.resize(n_layer);
  6112. const auto tn = LLM_TN(model.arch);
  6113. switch (model.arch) {
  6114. case LLM_ARCH_LLAMA:
  6115. case LLM_ARCH_REFACT:
  6116. case LLM_ARCH_MINICPM:
  6117. {
  6118. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6119. // output
  6120. {
  6121. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6122. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6123. // if output is NULL, init from the input tok embed
  6124. if (model.output == NULL) {
  6125. model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  6126. }
  6127. }
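// note: when the file has no separate output.weight (tied embeddings), the
// token embedding matrix is reused as the output projection; TENSOR_DUPLICATED
// marks that it is backed by the same file tensor as tok_embd.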
  6128. for (int i = 0; i < n_layer; ++i) {
  6129. ggml_context * ctx_layer = ctx_for_layer(i);
  6130. ggml_context * ctx_split = ctx_for_layer_split(i);
  6131. auto & layer = model.layers[i];
  6132. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6133. layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
  6134. layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
  6135. layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
  6136. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
  6137. // optional bias tensors
  6138. layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6139. layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6140. layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6141. layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6142. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  6143. layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  6144. if (n_expert == 0) {
  6145. layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  6146. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  6147. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6148. // optional MLP bias
  6149. layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6150. layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6151. layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6152. } else {
  6153. layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
  6154. layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6155. if (layer.ffn_gate_exps) {
  6156. layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  6157. layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
  6158. } else {
6159. // older models store each expert as a separate tensor - merge them into a single 3D tensor for compatibility
6160. // this requires disabling mmap
  6161. use_mmap_buffer = false;
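// note: the merged tensor needs one contiguous allocation covering all experts,
// while the file stores every expert as a separate tensor, so the weights must
// be copied in instead of being mapped directly from the file.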
  6162. ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
  6163. ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
  6164. ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
  6165. layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
  6166. layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
  6167. layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
  6168. ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
  6169. ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
  6170. ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
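// note: the experts are laid out along dimension 2 of the merged tensor, so
// nb[2] is the byte stride of one expert matrix and expert x starts at byte
// offset nb[2]*x.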
  6171. for (uint32_t x = 0; x < n_expert; ++x) {
  6172. // the individual experts are loaded into a view of the merged tensor
  6173. ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
  6174. ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
  6175. ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
  6176. }
  6177. }
  6178. }
  6179. }
  6180. } break;
  6181. case LLM_ARCH_GROK:
  6182. {
  6183. if (n_expert == 0) {
  6184. throw std::runtime_error("Grok model cannot have zero experts");
  6185. }
  6186. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6187. // output
  6188. {
  6189. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6190. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6191. // if output is NULL, init from the input tok embed
  6192. if (model.output == NULL) {
  6193. model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  6194. }
  6195. }
  6196. for (int i = 0; i < n_layer; ++i) {
  6197. ggml_context * ctx_layer = ctx_for_layer(i);
  6198. ggml_context * ctx_split = ctx_for_layer_split(i);
  6199. auto & layer = model.layers[i];
  6200. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6201. layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  6202. layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  6203. layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  6204. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6205. layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
  6206. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  6207. layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
  6208. layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6209. if (layer.ffn_gate_exps) {
  6210. layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
  6211. layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
  6212. } else {
6213. // older models store each expert as a separate tensor - merge them into a single 3D tensor for compatibility
6214. // this requires disabling mmap
  6215. use_mmap_buffer = false;
  6216. ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type;
  6217. ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type;
  6218. ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type;
  6219. layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert);
  6220. layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert);
  6221. layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert);
  6222. ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str());
  6223. ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str());
  6224. ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str());
  6225. for (uint32_t x = 0; x < n_expert; ++x) {
  6226. // the individual experts are loaded into a view of the merged tensor
  6227. ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x);
  6228. ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x);
  6229. ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x);
  6230. }
  6231. }
  6232. layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
  6233. }
  6234. } break;
  6235. case LLM_ARCH_DBRX:
  6236. {
  6237. if (n_expert == 0) {
  6238. throw std::runtime_error("DBRX model cannot have zero experts");
  6239. }
  6240. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6241. // output
  6242. {
  6243. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6244. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
  6245. }
  6246. for (int i = 0; i < n_layer; ++i) {
  6247. ggml_context * ctx_layer = ctx_for_layer(i);
  6248. ggml_context * ctx_split = ctx_for_layer_split(i);
  6249. auto & layer = model.layers[i];
  6250. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6251. layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  6252. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6253. layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
  6254. layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
  6255. layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert});
  6256. layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert});
  6257. layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
  6258. }
  6259. } break;
  6260. case LLM_ARCH_BAICHUAN:
  6261. {
  6262. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6263. {
  6264. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6265. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
  6266. }
  6267. for (int i = 0; i < n_layer; ++i) {
  6268. ggml_context * ctx_layer = ctx_for_layer(i);
  6269. ggml_context * ctx_split = ctx_for_layer_split(i);
  6270. auto & layer = model.layers[i];
  6271. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6272. layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  6273. layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  6274. layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  6275. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6276. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  6277. layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  6278. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  6279. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6280. }
  6281. } break;
  6282. case LLM_ARCH_FALCON:
  6283. {
  6284. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6285. // output
  6286. {
  6287. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6288. model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
  6289. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6290. if (!model.output) {
  6291. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  6292. }
  6293. }
  6294. for (int i = 0; i < n_layer; ++i) {
  6295. ggml_context * ctx_layer = ctx_for_layer(i);
  6296. ggml_context * ctx_split = ctx_for_layer_split(i);
  6297. auto & layer = model.layers[i];
  6298. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6299. layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
  6300. layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6301. layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6302. layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  6303. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6304. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  6305. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6306. }
  6307. } break;
  6308. case LLM_ARCH_STARCODER:
  6309. {
  6310. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6311. model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
  6312. // output
  6313. {
  6314. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6315. model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
  6316. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6317. if (!model.output) {
  6318. // needs to be on GPU
  6319. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  6320. }
  6321. }
  6322. for (int i = 0; i < n_layer; ++i) {
  6323. ggml_context * ctx_layer = ctx_for_layer(i);
  6324. ggml_context * ctx_split = ctx_for_layer_split(i);
  6325. auto & layer = model.layers[i];
  6326. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6327. layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
  6328. layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  6329. layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
  6330. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6331. layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
  6332. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  6333. layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
  6334. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
  6335. layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
  6336. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6337. layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  6338. }
  6339. } break;
  6340. case LLM_ARCH_BERT:
  6341. case LLM_ARCH_NOMIC_BERT:
  6342. {
  6343. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6344. model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type});
  6345. if (model.arch == LLM_ARCH_BERT) {
  6346. model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});
  6347. }
  6348. model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
  6349. model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
  6350. for (int i = 0; i < n_layer; ++i) {
  6351. ggml_context * ctx_layer = ctx_for_layer(i);
  6352. ggml_context * ctx_split = ctx_for_layer_split(i);
  6353. auto & layer = model.layers[i];
  6354. if (model.arch == LLM_ARCH_BERT) {
  6355. layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  6356. layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
  6357. layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  6358. layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
  6359. layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  6360. layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
  6361. } else {
  6362. layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  6363. }
  6364. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6365. layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd});
  6366. layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
  6367. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6368. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
  6369. if (model.arch == LLM_ARCH_BERT) {
  6370. layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
  6371. layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  6372. layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
  6373. } else {
  6374. layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  6375. }
  6376. layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
  6377. layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
  6378. }
  6379. } break;
  6380. case LLM_ARCH_JINA_BERT_V2:
  6381. {
  6382. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
  6383. model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); // token_type_embeddings
  6384. model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
6385. model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); // LayerNorm bias
  6386. for (int i = 0; i < n_layer; ++i) {
  6387. ggml_context * ctx_layer = ctx_for_layer(i);
  6388. ggml_context * ctx_split = ctx_for_layer_split(i);
  6389. auto & layer = model.layers[i]; // JinaBertLayer
  6390. layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  6391. layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
  6392. layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6393. layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6394. layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  6395. layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
  6396. layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6397. layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6398. layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  6399. layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
6400. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // output_dense
6401. layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); // output_dense bias
6402. layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); // output_norm
  6403. layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
  6404. layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6405. layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6406. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6407. layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  6408. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
  6409. layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
  6410. layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
  6411. layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
  6412. }
  6413. } break;
  6414. case LLM_ARCH_BLOOM:
  6415. {
  6416. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6417. model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
  6418. model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
  6419. // output
  6420. {
  6421. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6422. model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
  6423. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
  6424. }
  6425. for (int i = 0; i < n_layer; ++i) {
  6426. ggml_context * ctx_layer = ctx_for_layer(i);
  6427. ggml_context * ctx_split = ctx_for_layer_split(i);
  6428. auto & layer = model.layers[i];
  6429. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6430. layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
  6431. layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  6432. layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
  6433. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6434. layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
  6435. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  6436. layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
  6437. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
  6438. layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
  6439. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6440. layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  6441. }
  6442. } break;
  6443. case LLM_ARCH_MPT:
  6444. {
  6445. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6446. model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6447. // output
  6448. {
  6449. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6450. model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6451. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6452. if (!model.output) {
  6453. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
  6454. }
  6455. }
  6456. for (int i = 0; i < n_layer; ++i) {
  6457. ggml_context * ctx_layer = ctx_for_layer(i);
  6458. ggml_context * ctx_split = ctx_for_layer_split(i);
  6459. auto & layer = model.layers[i];
  6460. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6461. layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6462. layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
  6463. layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6464. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6465. layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6466. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  6467. layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6468. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
  6469. layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6470. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6471. layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6472. layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6473. layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6474. layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6475. layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6476. // AWQ ScaleActivation layer
  6477. layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6478. }
  6479. } break;
  6480. case LLM_ARCH_STABLELM:
  6481. {
  6482. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6483. // output
  6484. {
  6485. model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
  6486. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6487. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
  6488. }
  6489. for (int i = 0; i < n_layer; ++i) {
  6490. ggml_context * ctx_layer = ctx_for_layer(i);
  6491. ggml_context * ctx_split = ctx_for_layer_split(i);
  6492. auto & layer = model.layers[i];
  6493. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6494. layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
  6495. layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  6496. layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  6497. layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  6498. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6499. // optional bias tensors, present in Stable LM 2 1.6B
  6500. layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6501. layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6502. layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6503. // optional q and k layernorms, present in StableLM 2 12B
  6504. layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6505. layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6506. // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
  6507. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6508. layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6509. layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  6510. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  6511. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6512. }
  6513. } break;
  6514. case LLM_ARCH_QWEN:
  6515. {
  6516. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6517. // output
  6518. {
  6519. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6520. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
  6521. }
  6522. for (int i = 0; i < n_layer; ++i) {
  6523. ggml_context * ctx_layer = ctx_for_layer(i);
  6524. ggml_context * ctx_split = ctx_for_layer_split(i);
  6525. auto & layer = model.layers[i];
  6526. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6527. layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3});
  6528. layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3});
  6529. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6530. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  6531. layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2});
  6532. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd});
  6533. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
  6534. }
  6535. } break;
  6536. case LLM_ARCH_QWEN2:
  6537. {
  6538. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6539. // output
  6540. {
  6541. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6542. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6543. // if output is NULL, init from the input tok embed
  6544. if (model.output == NULL) {
  6545. model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
  6546. }
  6547. }
  6548. for (int i = 0; i < n_layer; ++i) {
  6549. ggml_context * ctx_layer = ctx_for_layer(i);
  6550. ggml_context * ctx_split = ctx_for_layer_split(i);
  6551. auto & layer = model.layers[i];
  6552. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6553. layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  6554. layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  6555. layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  6556. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6557. // optional bias tensors
  6558. layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
  6559. layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
  6560. layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
  6561. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  6562. layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  6563. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  6564. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6565. }
  6566. } break;
  6567. case LLM_ARCH_QWEN2MOE:
  6568. {
  6569. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6570. // output
  6571. {
  6572. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6573. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
  6574. }
  6575. for (int i = 0; i < n_layer; ++i) {
  6576. ggml_context * ctx_layer = ctx_for_layer(i);
  6577. ggml_context * ctx_split = ctx_for_layer_split(i);
  6578. auto & layer = model.layers[i];
  6579. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6580. layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  6581. layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  6582. layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  6583. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6584. // optional bias tensors
  6585. layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
  6586. layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
  6587. layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
  6588. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  6589. layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
  6590. GGML_ASSERT(n_expert > 0);
  6591. GGML_ASSERT(n_expert_used > 0);
  6592. // MoE branch
  6593. const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
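// illustrative example (hypothetical values): with n_ff = 5632 and
// n_expert_used = 4, a model that does not store n_ff_exp falls back to
// 5632/4 = 1408 columns per expert FFN.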
  6594. layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
  6595. layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
  6596. layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
  6597. // Shared expert branch
  6598. const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
  6599. layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
  6600. layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp});
  6601. layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
  6602. layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp});
  6603. }
  6604. } break;
  6605. case LLM_ARCH_PHI2:
  6606. {
  6607. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6608. // output
  6609. {
  6610. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6611. model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
  6612. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
  6613. model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab});
  6614. }
  6615. for (int i = 0; i < n_layer; ++i) {
  6616. ggml_context * ctx_layer = ctx_for_layer(i);
  6617. ggml_context * ctx_split = ctx_for_layer_split(i);
  6618. auto & layer = model.layers[i];
  6619. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6620. layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
  6621. layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6622. layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
  6623. if (layer.wqkv == nullptr) {
  6624. layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  6625. layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
  6626. layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  6627. layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
  6628. layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  6629. layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
  6630. }
  6631. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6632. layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
  6633. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
  6634. layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
  6635. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6636. layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
  6637. }
  6638. } break;
  6639. case LLM_ARCH_PHI3:
  6640. {
  6641. const int64_t n_embd_head = n_embd / n_head;
  6642. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab });
  6643. // output
  6644. {
  6645. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd });
  6646. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab });
  6647. }
  6648. for (int i = 0; i < n_layer; ++i) {
  6649. ggml_context * ctx_layer = ctx_for_layer(i);
  6650. ggml_context * ctx_split = ctx_for_layer_split(i);
  6651. auto & layer = model.layers[i];
  6652. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
  6653. layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
  6654. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
  6655. layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
  6656. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
  6657. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
  6658. layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
  6659. layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
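// note: the long/short RoPE factor tensors hold one scaling factor per rotated
// dimension pair, hence the n_embd_head/2 length.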
  6660. }
  6661. } break;
  6662. case LLM_ARCH_PLAMO:
  6663. {
  6664. model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  6665. // output
  6666. {
  6667. model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
  6668. model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
  6669. }
  6670. for (int i = 0; i < n_layer; ++i) {
  6671. ggml_context * ctx_layer = ctx_for_layer(i);
  6672. ggml_context * ctx_split = ctx_for_layer_split(i);
  6673. auto & layer = model.layers[i];
  6674. layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  6675. layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
  6676. layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
  6677. layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
  6678. layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
  6679. layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
  6680. layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  6681. layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
  6682. }
  6683. } break;
            case LLM_ARCH_GPT2:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
                    model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});

                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});

                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                    }
                } break;
            case LLM_ARCH_CODESHELL:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});

                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});

                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                    }
                } break;
            case LLM_ARCH_ORION:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                    }
                } break;
            case LLM_ARCH_INTERNLM2:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                    }
                } break;
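            // Gemma: the output head is tied to the token embedding; the embedding tensor is
            // loaded a second time with TENSOR_DUPLICATED so the copy can be offloaded together
            // with the output layer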
            case LLM_ARCH_GEMMA:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                    }
                } break;
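            // Gemma 2: same layout as Gemma plus a post-attention norm and a post-FFN norm per layer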
            case LLM_ARCH_GEMMA2:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
                        layer.attn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
                    }
                } break;
            case LLM_ARCH_STARCODER2:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed
                        if (model.output == NULL) {
                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                        // optional bias tensors
                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});

                        // optional bias tensors
                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                    }
                } break;
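            // Mamba: no attention tensors; each layer carries the SSM projections (in/x/dt/out),
            // the depthwise conv1d and the A/D state tensors. ssm_a and ssm_d have no "weight"
            // suffix in the GGUF naming, and only an expansion factor of 2 (d_inner == 2*n_embd)
            // is supported, which is asserted below.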
            case LLM_ARCH_MAMBA:
                {
                    const int64_t d_conv = hparams.ssm_d_conv;
                    const int64_t d_inner = hparams.ssm_d_inner;
                    const int64_t d_state = hparams.ssm_d_state;
                    const int64_t dt_rank = hparams.ssm_dt_rank;

                    // only an expansion factor of 2 is supported for now
                    GGML_ASSERT(2 * n_embd == d_inner);

                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
                        if (model.output == NULL) {
                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        // norm
                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

                        layer.ssm_in = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner});

                        layer.ssm_conv1d = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner});
                        layer.ssm_conv1d_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner});

                        layer.ssm_x = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state});

                        layer.ssm_dt = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner});
                        layer.ssm_dt_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner});

                        // no "weight" suffix for these
                        layer.ssm_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner});
                        layer.ssm_d = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_D, i), {d_inner});

                        // out_proj
                        layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
                    }
                } break;
            case LLM_ARCH_XVERSE:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                    }
                } break;
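            // Command-R: the output head is tied to the token embedding; the larger variants
            // (64+ layers) additionally carry per-head Q/K norms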
            case LLM_ARCH_COMMAND_R:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        // init output from the input tok embed
                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

                        if (n_layer >= 64) {
                            layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
                            layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
                        }

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                    }
                } break;
            case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed
                        if (model.output == NULL) {
                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                    }
                } break;
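            // OpenELM: the head counts and FFN width vary per layer, so they are re-read from
            // hparams inside the loop instead of using the globals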
            case LLM_ARCH_OPENELM:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        // init output from the input tok embed
                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        const int64_t n_head = hparams.n_head(i);
                        const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
                        const int64_t n_ff = hparams.n_ff(i);

                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k});
                        layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k});
                        layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                    }
                } break;
            case LLM_ARCH_GPTNEOX:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});

                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});

                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                    }
                } break;
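            // Arctic: every layer has both a dense FFN (n_embd-wide projections) and a MoE branch
            // with n_expert gated experts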
            case LLM_ARCH_ARCTIC:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed
                        if (model.output == NULL) {
                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});

                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
                        layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
                        layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
                        layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
                    }
                } break;
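            // DeepSeek-V2: low-rank Q and KV attention projections (q_lora_rank / kv_lora_rank);
            // the "lite" variant (27 layers) keeps a plain Q projection instead. The first
            // n_layer_dense_lead layers use a dense FFN, the remaining layers use MoE experts
            // plus a shared-expert branch.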
            case LLM_ARCH_DEEPSEEK2:
                {
                    const bool is_lite = (hparams.n_layer == 27);

                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
                    const int64_t q_lora_rank = hparams.n_lora_q;
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
                    const int64_t n_ff_exp = hparams.n_ff_exp;
                    const int64_t n_expert_shared = hparams.n_expert_shared;

                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        if (!is_lite) {
                            layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank});
                        }
                        layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});

                        if (!is_lite) {
                            layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
                            layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
                        } else {
                            layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
                        }

                        layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
                        layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

                        if (i < (int) hparams.n_layer_dense_lead) {
                            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                            layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                            layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                        } else {
                            layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

                            GGML_ASSERT(n_expert > 0);
                            GGML_ASSERT(n_expert_used > 0);

                            // MoE branch
                            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
                            layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
                            layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});

                            // Shared expert branch
                            layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared});
                            layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd});
                            layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared});
                        }
                    }
                } break;
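            // BitNet: each quantized projection has a companion per-tensor "scale" of size {1},
            // plus extra sub-norms inside the attention and FFN blocks; there is no separate
            // output head loaded here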
            case LLM_ARCH_BITNET:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.wq_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "scale", i), {1});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.wk_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "scale", i), {1});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.wv_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "scale", i), {1});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                        layer.wo_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_sub_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff});

                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_gate_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "scale", i), {1});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                        layer.ffn_down_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                        layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1});
                    }
                } break;
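            // T5: encoder-decoder model; every layer loads separate encoder self-attention,
            // decoder self-attention and decoder cross-attention tensor sets, plus optional
            // relative attention bias buckets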
            case LLM_ARCH_T5:
                {
                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;

                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd});

                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed
                        if (model.output == NULL) {
                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);

                        layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
                        layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});

                        layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff});

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_rel_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});

                        layer.attn_norm_cross = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd});
                        // this tensor seems to be unused in HF transformers implementation
                        layer.attn_rel_b_cross = ml.create_tensor(ctx_input, tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);

                        layer.wq_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wk_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wv_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
                        layer.wo_cross = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff});
                    }
                } break;
            case LLM_ARCH_T5ENCODER:
                {
                    const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;

                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm_enc = ml.create_tensor(ctx_output, tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd});

                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        // if output is NULL, init from the input tok embed
                        if (model.output == NULL) {
                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                        }
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_rel_b_enc = ml.create_tensor(ctx_input, tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, llama_model_loader::TENSOR_NOT_REQUIRED);

                        layer.wq_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wk_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wv_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
                        layer.wo_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd});

                        layer.ffn_norm_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_gate_enc = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.ffn_down_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up_enc = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff});
                    }
                } break;
            case LLM_ARCH_JAIS:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});

                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});

                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff});

                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                    }
                } break;
            case LLM_ARCH_CHATGLM:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});

                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
                    }
                } break;
            case LLM_ARCH_NEMOTRON:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                        // optional bias tensors
                        layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});

                        // optional MLP bias
                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                    }
                } break;
            case LLM_ARCH_EXAONE:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
                        layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                    }
                } break;
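            // RWKV-6: recurrent model; layers are built from time-mix (attention-like) and
            // channel-mix (FFN-like) tensors, including the per-channel LERP mixing weights,
            // rather than from separate Q/K/V projections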
            case LLM_ARCH_RWKV6:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // Block 0, LN0
                    model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
                    model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});

                    // output
                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});

                    const int time_mix_extra_dim = hparams.time_mix_extra_dim;
                    const int time_decay_extra_dim = hparams.time_decay_extra_dim;
                    const int head_size = hparams.wkv_head_size;
                    const int attn_hidden_size = n_embd;
                    const int ffn_size = hparams.n_ff_arr[0];

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

                        layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
                        layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});

                        layer.time_mix_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5});
                        layer.time_mix_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5});

                        layer.time_mix_lerp_x = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1});
                        layer.time_mix_lerp_w = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1});
                        layer.time_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
                        layer.time_mix_lerp_v = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1});
                        layer.time_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1});
                        layer.time_mix_lerp_g = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1});

                        layer.time_mix_first = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size});
                        layer.time_mix_decay = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd});
                        layer.time_mix_decay_w1 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim});
                        layer.time_mix_decay_w2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size});
                        layer.time_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd});
                        layer.time_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd});
                        layer.time_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd});
                        layer.time_mix_gate = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd});

                        layer.time_mix_ln = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd});
                        layer.time_mix_ln_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd});
                        layer.time_mix_output = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size});

                        layer.channel_mix_lerp_k = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1});
                        layer.channel_mix_lerp_r = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1});

                        layer.channel_mix_key = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size});
                        layer.channel_mix_value = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd});
                        layer.channel_mix_receptance = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd});
                    }
                } break;
            case LLM_ARCH_SOLAR:
                {
                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

                    // output
                    {
                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                    }

                    for (int i = 0; i < n_layer; ++i) {
                        ggml_context * ctx_layer = ctx_for_layer(i);
                        ggml_context * ctx_split = ctx_for_layer_split(i);

                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

                        layer.bskcn_tv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_BSKCN_TV, "weight"), {2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));

                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                    }
                } break;
            default:
                throw std::runtime_error("unknown architecture");
        }
    }

    ml.done_getting_tensors();

    ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr);
    model.mappings.reserve(ml.mappings.size());
    // create the backend buffers
    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
    ctx_bufs.reserve(ctx_map.size());

    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
    size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
    model.bufs.reserve(n_max_backend_buffer);
    for (auto & it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;

        llama_buf_map bufs;
        bufs.reserve(n_max_backend_buffer);

        // only the mmap region containing the tensors in the model is mapped to the backend buffer
        // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
        // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) {
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                void * addr = nullptr;
                size_t first, last;
                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
                if (first >= last) {
                    continue;
                }
                ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
                if (buf == nullptr) {
                    throw std::runtime_error("unable to allocate backend CPU buffer");
                }
                model.bufs.push_back(buf);
                bufs.emplace(idx, buf);
#ifdef GGML_USE_CUDA
                if (n_layer >= n_gpu_layers) {
                    ggml_backend_cuda_register_host_buffer(
                        ggml_backend_buffer_get_base(buf),
                        ggml_backend_buffer_get_size(buf));
                }
#endif
            }
        }
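        // with Metal, the same mmap-ed file region is wrapped directly into a Metal buffer via
        // ggml_backend_metal_buffer_from_ptr, which avoids copying the weights on unified-memory systems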
  7447. #ifdef GGML_USE_METAL
  7448. else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
  7449. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  7450. const size_t max_size = ggml_get_max_tensor_size(ctx);
  7451. void * addr = nullptr;
  7452. size_t first, last;
  7453. ml.get_mapping_range(&first, &last, &addr, idx, ctx);
  7454. if (first >= last) {
  7455. continue;
  7456. }
  7457. ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
  7458. if (buf == nullptr) {
  7459. throw std::runtime_error("unable to allocate backend metal buffer");
  7460. }
  7461. model.bufs.push_back(buf);
  7462. bufs.emplace(idx, buf);
  7463. }
  7464. }
  7465. #endif
  7466. else {
  7467. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  7468. if (buf == nullptr) {
  7469. throw std::runtime_error("unable to allocate backend buffer");
  7470. }
  7471. model.bufs.push_back(buf);
  7472. if (use_mlock && ggml_backend_buffer_is_host(buf)) {
  7473. model.mlock_bufs.emplace_back(new llama_mlock);
  7474. auto & mlock_buf = model.mlock_bufs.back();
  7475. mlock_buf->init (ggml_backend_buffer_get_base(buf));
  7476. mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
  7477. }
  7478. for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
  7479. bufs.emplace(idx, buf);
  7480. }
  7481. }
  7482. if (bufs.empty()) {
  7483. throw std::runtime_error("failed to allocate buffer");
  7484. }
  7485. for (auto & buf : bufs) {
  7486. // indicate that this buffer contains weights
  7487. // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
  7488. ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  7489. }
  7490. ctx_bufs.emplace_back(ctx, bufs);
  7491. }
  7492. if (llama_supports_gpu_offload()) {
  7493. const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  7494. LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
  7495. if (n_gpu_layers > (int) hparams.n_layer) {
  7496. LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
  7497. }
  7498. const int max_backend_supported_layers = hparams.n_layer + 1;
  7499. const int max_offloadable_layers = hparams.n_layer + 1;
  7500. LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
  7501. }
  7502. // print memory requirements
  7503. for (ggml_backend_buffer_t buf : model.bufs) {
  7504. LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
  7505. }
  7506. // populate tensors_by_name
  7507. for (ggml_context * ctx : model.ctxs) {
  7508. for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
  7509. model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
  7510. }
  7511. }
  7512. // load tensor data
  7513. for (auto & it : ctx_bufs) {
  7514. ggml_context * ctx = it.first;
  7515. auto & bufs = it.second;
  7516. if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
  7517. return false;
  7518. }
  7519. }
  7520. if (use_mmap_buffer) {
  7521. for (auto & mapping : ml.mappings) {
  7522. model.mappings.emplace_back(std::move(mapping));
  7523. }
  7524. }
7525. // loading time will be recalculated after the first eval, so
  7526. // we take page faults deferred by mmap() into consideration
  7527. model.t_load_us = ggml_time_us() - model.t_start_us;
  7528. return true;
  7529. }
  7530. // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
  7531. static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
  7532. try {
  7533. llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
  7534. model.hparams.vocab_only = params.vocab_only;
  7535. try {
  7536. llm_load_arch(ml, model);
  7537. } catch(const std::exception & e) {
  7538. throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
  7539. }
  7540. try {
  7541. llm_load_hparams(ml, model);
  7542. } catch(const std::exception & e) {
  7543. throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
  7544. }
  7545. try {
  7546. llm_load_vocab(ml, model);
  7547. } catch(const std::exception & e) {
  7548. throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
  7549. }
  7550. llm_load_print_meta(ml, model);
  7551. if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
  7552. model.hparams.n_vocab != model.vocab.id_to_token.size()) {
  7553. throw std::runtime_error("vocab size mismatch");
  7554. }
  7555. if (params.vocab_only) {
  7556. LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
  7557. return 0;
  7558. }
  7559. #ifdef GGML_USE_KOMPUTE
  7560. if (params.n_gpu_layers > 0 && (
  7561. !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
  7562. || !(
  7563. model.ftype == LLAMA_FTYPE_ALL_F32 ||
  7564. model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
  7565. model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
  7566. model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
  7567. model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
  7568. )
  7569. )) {
  7570. // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
  7571. LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
  7572. params.n_gpu_layers = 0;
  7573. }
  7574. #endif
  7575. if (!llm_load_tensors(
  7576. ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
  7577. params.progress_callback, params.progress_callback_user_data
  7578. )) {
  7579. return -2;
  7580. }
  7581. } catch (const std::exception & err) {
  7582. LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
  7583. return -1;
  7584. }
  7585. return 0;
  7586. }
  7587. //
  7588. // llm_build
  7589. //
  7590. using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
  7591. enum llm_ffn_op_type {
  7592. LLM_FFN_SILU,
  7593. LLM_FFN_GELU,
  7594. LLM_FFN_RELU,
  7595. LLM_FFN_RELU_SQR,
  7596. LLM_FFN_SWIGLU,
  7597. };
  7598. enum llm_ffn_gate_type {
  7599. LLM_FFN_SEQ,
  7600. LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
  7601. };
  7602. enum llm_norm_type {
  7603. LLM_NORM,
  7604. LLM_NORM_RMS,
  7605. };
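// llm_build_inp_embd: build the graph inputs for the first layer, either by looking up token ids in
// tok_embd (when batch.token is set) or by passing externally supplied embeddings through lctx.inp_embd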
  7606. static struct ggml_tensor * llm_build_inp_embd(
  7607. struct ggml_context * ctx,
  7608. struct llama_context & lctx,
  7609. const llama_hparams & hparams,
  7610. const llama_ubatch & batch,
  7611. struct ggml_tensor * tok_embd,
  7612. const llm_build_cb & cb) {
  7613. const int64_t n_embd = hparams.n_embd;
  7614. struct ggml_tensor * inpL;
  7615. if (batch.token) {
  7616. lctx.inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens);
  7617. cb(lctx.inp_tokens, "inp_tokens", -1);
  7618. ggml_set_input(lctx.inp_tokens);
  7619. inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
  7620. } else {
  7621. lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
  7622. inpL = lctx.inp_embd;
  7623. ggml_set_input(lctx.inp_embd);
  7624. }
  7625. cb(inpL, "inp_embd", -1);
  7626. return inpL;
  7627. }
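// llm_build_kv_store: write the current K (already RoPE-ed) and V tensors of this ubatch into the
// per-layer KV cache at offset kv_head; without flash attention the V cache is stored transposed,
// matching the layout expected by the V*softmax(KQ) matmul in llm_build_kqv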
  7628. static void llm_build_kv_store(
  7629. struct ggml_context * ctx,
  7630. const llama_hparams & hparams,
  7631. const llama_cparams & cparams,
  7632. const llama_kv_cache & kv,
  7633. struct ggml_cgraph * graph,
  7634. struct ggml_tensor * k_cur,
  7635. struct ggml_tensor * v_cur,
  7636. int32_t n_tokens,
  7637. int32_t kv_head,
  7638. const llm_build_cb & cb,
  7639. int64_t il) {
  7640. const int64_t n_ctx = cparams.n_ctx;
  7641. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  7642. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
  7643. GGML_ASSERT(kv.size == n_ctx);
  7644. struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa)*kv_head);
  7645. cb(k_cache_view, "k_cache_view", il);
  7646. // note: storing RoPE-ed version of K in the KV cache
  7647. ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
  7648. assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
  7649. struct ggml_tensor * v_cache_view = nullptr;
  7650. if (cparams.flash_attn) {
  7651. v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)*kv_head);
  7652. } else {
  7653. // note: the V cache is transposed when not using flash attention
  7654. v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
  7655. ( n_ctx)*ggml_element_size(kv.v_l[il]),
  7656. (kv_head)*ggml_element_size(kv.v_l[il]));
  7657. v_cur = ggml_transpose(ctx, v_cur);
  7658. }
  7659. cb(v_cache_view, "v_cache_view", il);
  7660. ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
  7661. }
7662. // do mat_mul, optionally applying LoRA adapters
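// for each active adapter the delta is scale * B*(A*cur), added on top of w*cur;
// scale = adapter_scale * alpha / rank when alpha != 0, otherwise just the adapter scale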
  7663. static struct ggml_tensor * llm_build_lora_mm(
  7664. struct llama_context & lctx,
  7665. struct ggml_context * ctx0,
  7666. struct ggml_tensor * w,
  7667. struct ggml_tensor * cur) {
  7668. struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
  7669. for (auto & it : lctx.lora_adapters) {
  7670. struct llama_lora_weight * lora = it.first->get_weight(w);
  7671. if (lora == nullptr) {
  7672. continue;
  7673. }
  7674. const float alpha = it.first->alpha;
  7675. const float rank = (float) lora->b->ne[0];
  7676. const float scale = alpha ? it.second * alpha / rank : it.second;
  7677. struct ggml_tensor * ab_cur = ggml_mul_mat(
  7678. ctx0, lora->b,
  7679. ggml_mul_mat(ctx0, lora->a, cur)
  7680. );
  7681. ab_cur = ggml_scale(ctx0, ab_cur, scale);
  7682. res = ggml_add(ctx0, res, ab_cur);
  7683. }
  7684. return res;
  7685. }
7686. // do mat_mul_id, optionally applying LoRA adapters
  7687. static struct ggml_tensor * llm_build_lora_mm_id(
  7688. struct llama_context & lctx,
  7689. struct ggml_context * ctx0,
  7690. struct ggml_tensor * w, // struct ggml_tensor * as
  7691. struct ggml_tensor * cur, // struct ggml_tensor * b
  7692. struct ggml_tensor * ids) {
  7693. struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
  7694. for (auto & it : lctx.lora_adapters) {
  7695. struct llama_lora_weight * lora = it.first->get_weight(w);
  7696. if (lora == nullptr) {
  7697. continue;
  7698. }
  7699. const float alpha = it.first->alpha;
  7700. const float rank = (float) lora->b->ne[0];
  7701. const float scale = alpha ? it.second * alpha / rank : it.second;
  7702. struct ggml_tensor * ab_cur = ggml_mul_mat_id(
  7703. ctx0, lora->b,
  7704. ggml_mul_mat_id(ctx0, lora->a, cur, ids),
  7705. ids
  7706. );
  7707. ab_cur = ggml_scale(ctx0, ab_cur, scale);
  7708. res = ggml_add(ctx0, res, ab_cur);
  7709. }
  7710. return res;
  7711. }
  7712. static struct ggml_tensor * llm_build_norm(
  7713. struct ggml_context * ctx,
  7714. struct ggml_tensor * cur,
  7715. const llama_hparams & hparams,
  7716. struct ggml_tensor * mw,
  7717. struct ggml_tensor * mb,
  7718. llm_norm_type type,
  7719. const llm_build_cb & cb,
  7720. int il) {
  7721. switch (type) {
  7722. case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break;
  7723. case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break;
  7724. }
  7725. if (mw || mb) {
  7726. cb(cur, "norm", il);
  7727. }
  7728. if (mw) {
  7729. cur = ggml_mul(ctx, cur, mw);
  7730. if (mb) {
  7731. cb(cur, "norm_w", il);
  7732. }
  7733. }
  7734. if (mb) {
  7735. cur = ggml_add(ctx, cur, mb);
  7736. }
  7737. return cur;
  7738. }
  7739. static struct ggml_tensor * llm_build_ffn(
  7740. struct ggml_context * ctx,
  7741. struct llama_context & lctx,
  7742. struct ggml_tensor * cur,
  7743. struct ggml_tensor * up,
  7744. struct ggml_tensor * up_b,
  7745. struct ggml_tensor * up_s,
  7746. struct ggml_tensor * gate,
  7747. struct ggml_tensor * gate_b,
  7748. struct ggml_tensor * gate_s,
  7749. struct ggml_tensor * down,
  7750. struct ggml_tensor * down_b,
  7751. struct ggml_tensor * down_s,
  7752. struct ggml_tensor * act_scales,
  7753. llm_ffn_op_type type_op,
  7754. llm_ffn_gate_type type_gate,
  7755. const llm_build_cb & cb,
  7756. int il) {
  7757. struct ggml_tensor * tmp = up ? llm_build_lora_mm(lctx, ctx, up, cur) : cur;
  7758. cb(tmp, "ffn_up", il);
  7759. if (up_b) {
  7760. tmp = ggml_add(ctx, tmp, up_b);
  7761. cb(tmp, "ffn_up_b", il);
  7762. }
  7763. if (up_s) {
  7764. tmp = ggml_mul(ctx, tmp, up_s);
  7765. cb(tmp, "ffn_up_s", il);
  7766. }
  7767. if (gate) {
  7768. switch (type_gate) {
  7769. case LLM_FFN_SEQ:
  7770. {
  7771. cur = llm_build_lora_mm(lctx, ctx, gate, tmp);
  7772. cb(cur, "ffn_gate", il);
  7773. } break;
  7774. case LLM_FFN_PAR:
  7775. {
  7776. cur = llm_build_lora_mm(lctx, ctx, gate, cur);
  7777. cb(cur, "ffn_gate", il);
  7778. } break;
  7779. }
  7780. if (gate_b) {
  7781. cur = ggml_add(ctx, cur, gate_b);
  7782. cb(cur, "ffn_gate_b", il);
  7783. }
  7784. if (gate_s) {
  7785. cur = ggml_mul(ctx, cur, gate_s);
  7786. cb(cur, "ffn_gate_s", il);
  7787. }
  7788. } else {
  7789. cur = tmp;
  7790. }
  7791. switch (type_op) {
  7792. case LLM_FFN_SILU:
  7793. {
  7794. cur = ggml_silu(ctx, cur);
  7795. cb(cur, "ffn_silu", il);
  7796. } break;
  7797. case LLM_FFN_GELU:
  7798. {
  7799. cur = ggml_gelu(ctx, cur);
  7800. cb(cur, "ffn_gelu", il);
  7801. if (act_scales != NULL) {
  7802. cur = ggml_div(ctx, cur, act_scales);
  7803. cb(cur, "ffn_act", il);
  7804. }
  7805. } break;
  7806. case LLM_FFN_RELU:
  7807. {
  7808. cur = ggml_relu(ctx, cur);
  7809. cb(cur, "ffn_relu", il);
  7810. } break;
  7811. case LLM_FFN_RELU_SQR:
  7812. {
  7813. cur = ggml_relu(ctx, cur);
  7814. cb(cur, "ffn_relu", il);
  7815. cur = ggml_sqr(ctx, cur);
  7816. cb(cur, "ffn_sqr(relu)", il);
  7817. } break;
  7818. case LLM_FFN_SWIGLU:
  7819. {
7820. // Project to 4h. If using SwiGLU, double the output width; see https://arxiv.org/pdf/2002.05202.pdf
  7821. int64_t split_point = cur->ne[0] / 2;
  7822. struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
  7823. struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
  7824. x0 = ggml_silu(ctx, x0);
  7825. cb(cur, "ffn_silu", il);
  7826. cur = ggml_mul(ctx, x0, x1);
  7827. cb(cur, "ffn_mul", il);
  7828. } break;
  7829. }
  7830. if (type_gate == LLM_FFN_PAR) {
  7831. cur = ggml_mul(ctx, cur, tmp);
  7832. cb(cur, "ffn_gate_par", il);
  7833. }
  7834. if (down) {
  7835. cur = llm_build_lora_mm(lctx, ctx, down, cur);
  7836. }
  7837. if (down_b) {
  7838. cb(cur, "ffn_down", il);
  7839. }
  7840. if (down_b) {
  7841. cur = ggml_add(ctx, cur, down_b);
  7842. }
  7843. if (down_s) {
  7844. cur = ggml_mul(ctx, cur, down_s);
  7845. cb(cur, "ffn_down_s", il);
  7846. }
  7847. return cur;
  7848. }
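// llm_build_moe_ffn: routed mixture-of-experts FFN
// 1. router logits = gate_inp @ cur, softmaxed into per-expert probabilities
// 2. the top n_expert_used experts are selected per token (weights optionally renormalized and/or scaled)
// 3. the selected experts run a gated FFN via mul_mat_id, and their outputs are summed, weighted by the router probabilities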
  7849. static struct ggml_tensor * llm_build_moe_ffn(
  7850. struct ggml_context * ctx,
  7851. struct llama_context & lctx,
  7852. struct ggml_tensor * cur,
  7853. struct ggml_tensor * gate_inp,
  7854. struct ggml_tensor * up_exps,
  7855. struct ggml_tensor * gate_exps,
  7856. struct ggml_tensor * down_exps,
  7857. int64_t n_expert,
  7858. int64_t n_expert_used,
  7859. llm_ffn_op_type type_op,
  7860. bool norm_w,
  7861. bool scale_w,
  7862. float w_scale,
  7863. const llm_build_cb & cb,
  7864. int il) {
  7865. int64_t n_embd = cur->ne[0];
  7866. int64_t n_tokens = cur->ne[1];
  7867. ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
  7868. cb(logits, "ffn_moe_logits", il);
  7869. ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
  7870. cb(probs, "ffn_moe_probs", il);
  7871. // select experts
  7872. ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
  7873. cb(selected_experts->src[0], "ffn_moe_argsort", il);
  7874. cb(selected_experts, "ffn_moe_topk", il);
  7875. ggml_tensor * weights = ggml_get_rows(ctx,
  7876. ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens]
  7877. cb(weights, "ffn_moe_weights", il);
  7878. if (norm_w) {
  7879. weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens);
  7880. ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens]
  7881. cb(weights_sum, "ffn_moe_weights_sum", il);
  7882. weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens]
  7883. cb(weights, "ffn_moe_weights_norm", il);
  7884. weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
  7885. }
  7886. if (scale_w) {
  7887. weights = ggml_scale(ctx, weights, w_scale);
  7888. cb(weights, "ffn_moe_weights_scaled", il);
  7889. }
  7890. cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
  7891. ggml_tensor * up = llm_build_lora_mm_id(lctx, ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
  7892. cb(up, "ffn_moe_up", il);
  7893. ggml_tensor * gate = llm_build_lora_mm_id(lctx, ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
  7894. cb(gate, "ffn_moe_gate", il);
  7895. switch (type_op) {
  7896. case LLM_FFN_SILU:
  7897. {
  7898. gate = ggml_silu(ctx, gate);
  7899. cb(gate, "ffn_moe_silu", il);
  7900. } break;
  7901. case LLM_FFN_GELU:
  7902. {
  7903. gate = ggml_gelu(ctx, gate);
  7904. cb(gate, "ffn_moe_gelu", il);
  7905. } break;
  7906. default:
  7907. GGML_ABORT("fatal error");
  7908. }
  7909. ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens]
  7910. cb(par, "ffn_moe_gate_par", il);
  7911. ggml_tensor * experts = llm_build_lora_mm_id(lctx, ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
  7912. cb(experts, "ffn_moe_down", il);
  7913. experts = ggml_mul(ctx, experts, weights);
  7914. // aggregate experts
  7915. ggml_tensor * moe_out = nullptr;
  7916. for (int i = 0; i < n_expert_used; ++i) {
  7917. ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
  7918. experts->nb[2], i*experts->nb[1]);
  7919. if (i == 0) {
  7920. moe_out = cur_expert;
  7921. } else {
  7922. moe_out = ggml_add(ctx, moe_out, cur_expert);
  7923. }
  7924. }
  7925. if (n_expert_used == 1) {
  7926. // avoid returning a non-contiguous tensor
  7927. moe_out = ggml_cont(ctx, moe_out);
  7928. }
  7929. return moe_out;
  7930. }
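// llm_build_kqv: compute the attention output for one layer from the cached K/V and the current Q,
// using ggml_flash_attn_ext when enabled, otherwise the explicit KQ softmax + V matmul path;
// the result is projected through wo (and wo_b) when present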
  7931. static struct ggml_tensor * llm_build_kqv(
  7932. struct ggml_context * ctx,
  7933. struct llama_context & lctx,
  7934. const llama_kv_cache & kv,
  7935. struct ggml_cgraph * graph,
  7936. struct ggml_tensor * wo,
  7937. struct ggml_tensor * wo_b,
  7938. struct ggml_tensor * q_cur,
  7939. struct ggml_tensor * kq_mask,
  7940. int32_t n_tokens,
  7941. int32_t n_kv,
  7942. float kq_scale,
  7943. const llm_build_cb & cb,
  7944. int il) {
  7945. const llama_model & model = lctx.model;
  7946. const llama_hparams & hparams = lctx.model.hparams;
  7947. const llama_cparams & cparams = lctx.cparams;
  7948. const int64_t n_ctx = cparams.n_ctx;
  7949. const int64_t n_head = hparams.n_head(il);
  7950. const int64_t n_head_kv = hparams.n_head_kv(il);
  7951. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  7952. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  7953. const int64_t n_embd_head_v = hparams.n_embd_head_v;
  7954. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
  7955. struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
  7956. cb(q, "q", il);
  7957. struct ggml_tensor * k =
  7958. ggml_view_3d(ctx, kv.k_l[il],
  7959. n_embd_head_k, n_kv, n_head_kv,
  7960. ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
  7961. ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
  7962. 0);
  7963. cb(k, "k", il);
  7964. struct ggml_tensor * cur;
  7965. if (cparams.flash_attn) {
  7966. GGML_UNUSED(model);
  7967. GGML_UNUSED(n_ctx);
  7968. // split cached v into n_head heads (not transposed)
  7969. struct ggml_tensor * v =
  7970. ggml_view_3d(ctx, kv.v_l[il],
  7971. n_embd_head_v, n_kv, n_head_kv,
  7972. ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
  7973. ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
  7974. 0);
  7975. cb(v, "v", il);
  7976. cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
  7977. hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
  7978. if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_GEMMA2) {
  7979. ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
  7980. }
  7981. cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
  7982. } else {
  7983. struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
  7984. cb(kq, "kq", il);
  7985. if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2 || model.arch == LLM_ARCH_NEMOTRON || model.arch == LLM_ARCH_CHATGLM) {
  7986. // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
  7987. // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
  7988. ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
  7989. }
  7990. if (model.arch == LLM_ARCH_GROK) {
  7991. // need to do the following:
7992. // multiply by the attn_output_multiplier of 0.08838834764831845
7993. // and then:
  7994. // kq = 30 * tanh(kq / 30)
  7995. // before the softmax below
  7996. //try from phi2
  7997. //ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
  7998. kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f));
  7999. kq = ggml_scale(ctx, kq, 30);
  8000. }
  8001. if (hparams.attn_soft_cap) {
  8002. kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping);
  8003. kq = ggml_tanh(ctx, kq);
  8004. kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping);
  8005. }
  8006. kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
  8007. cb(kq, "kq_soft_max_ext", il);
  8008. GGML_ASSERT(kv.size == n_ctx);
  8009. // split cached v into n_head heads
  8010. struct ggml_tensor * v =
  8011. ggml_view_3d(ctx, kv.v_l[il],
  8012. n_kv, n_embd_head_v, n_head_kv,
  8013. ggml_element_size(kv.v_l[il])*n_ctx,
  8014. ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
  8015. 0);
  8016. cb(v, "v", il);
  8017. struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
  8018. cb(kqv, "kqv", il);
  8019. struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
  8020. cb(kqv_merged, "kqv_merged", il);
  8021. cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
  8022. cb(cur, "kqv_merged_cont", il);
  8023. }
  8024. ggml_build_forward_expand(graph, cur);
  8025. if (wo) {
  8026. cur = llm_build_lora_mm(lctx, ctx, wo, cur);
  8027. }
  8028. if (wo_b) {
  8029. cb(cur, "kqv_wo", il);
  8030. }
  8031. if (wo_b) {
  8032. cur = ggml_add(ctx, cur, wo_b);
  8033. }
  8034. return cur;
  8035. }
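// llm_build_kv: convenience wrapper that first stores k_cur/v_cur in the KV cache (llm_build_kv_store)
// and then computes the attention output with llm_build_kqv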
  8036. static struct ggml_tensor * llm_build_kv(
  8037. struct ggml_context * ctx,
  8038. struct llama_context & lctx,
  8039. const llama_kv_cache & kv,
  8040. struct ggml_cgraph * graph,
  8041. struct ggml_tensor * wo,
  8042. struct ggml_tensor * wo_b,
  8043. struct ggml_tensor * k_cur,
  8044. struct ggml_tensor * v_cur,
  8045. struct ggml_tensor * q_cur,
  8046. struct ggml_tensor * kq_mask,
  8047. int32_t n_tokens,
  8048. int32_t kv_head,
  8049. int32_t n_kv,
  8050. float kq_scale,
  8051. const llm_build_cb & cb,
  8052. int il) {
  8053. const llama_hparams & hparams = lctx.model.hparams;
  8054. const llama_cparams & cparams = lctx.cparams;
  8055. // these nodes are added to the graph together so that they are not reordered
  8056. // by doing so, the number of splits in the graph is reduced
  8057. ggml_build_forward_expand(graph, q_cur);
  8058. ggml_build_forward_expand(graph, k_cur);
  8059. ggml_build_forward_expand(graph, v_cur);
  8060. llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il);
  8061. struct ggml_tensor * cur;
  8062. cur = llm_build_kqv(ctx, lctx, kv, graph, wo, wo_b, q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
  8063. cb(cur, "kqv_out", il);
  8064. return cur;
  8065. }
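// llm_build_copy_mask_state: gather the recurrent states used by this ubatch (via state_copy),
// zero the states of sequences that start in this batch (via state_mask), write back the states
// that will not be modified, and return a view of the n_seqs states that will be updated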
  8066. static struct ggml_tensor * llm_build_copy_mask_state(
  8067. struct ggml_context * ctx,
  8068. struct ggml_cgraph * graph,
  8069. struct ggml_tensor * s,
  8070. struct ggml_tensor * state_copy,
  8071. struct ggml_tensor * state_mask,
  8072. int32_t n_state,
  8073. int32_t kv_size,
  8074. int32_t kv_head,
  8075. int32_t n_kv,
  8076. int32_t n_seqs) {
  8077. struct ggml_tensor * states = ggml_reshape_2d(ctx, s, n_state, kv_size);
  8078. // copy states
  8079. // NOTE: assuming the copy destinations are ALL contained between kv_head and kv_head + n_kv
8080. // this shrinks the tensor's ne[1] to n_kv
  8081. states = ggml_get_rows(ctx, states, state_copy);
  8082. // clear states of sequences which are starting at the beginning of this batch
  8083. // FIXME: zero-out NANs?
  8084. states = ggml_mul(ctx, states, state_mask);
  8085. // copy states which won't be changed further (between n_seqs and n_rs)
  8086. ggml_build_forward_expand(graph,
  8087. ggml_cpy(ctx,
  8088. ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),
  8089. ggml_view_1d(ctx, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s))));
  8090. // the part of the states that will be used and modified
  8091. return ggml_view_2d(ctx, states, n_state, n_seqs, states->nb[1], 0);
  8092. }
  8093. // TODO: split
  8094. static struct ggml_tensor * llm_build_mamba(
  8095. struct ggml_context * ctx,
  8096. struct llama_context & lctx,
  8097. const llama_ubatch & batch,
  8098. struct ggml_cgraph * graph,
  8099. struct ggml_tensor * cur,
  8100. struct ggml_tensor * state_copy,
  8101. struct ggml_tensor * state_mask,
  8102. int32_t kv_head,
  8103. int32_t n_kv,
  8104. const llm_build_cb & cb,
  8105. int il) {
  8106. const llama_model & model = lctx.model;
  8107. const llama_hparams & hparams = model.hparams;
  8108. const llama_kv_cache & kv = lctx.kv_self;
  8109. const int64_t d_conv = hparams.ssm_d_conv;
  8110. const int64_t d_inner = hparams.ssm_d_inner;
  8111. const int64_t d_state = hparams.ssm_d_state;
  8112. const int64_t dt_rank = hparams.ssm_dt_rank;
  8113. const int64_t n_seqs = batch.n_seqs;
8114. // Some variants of the Mamba arch (e.g. FalconMamba) apply layer norm on the B, C and Dt layers
  8115. const bool ssm_dt_b_c_rms = hparams.ssm_dt_b_c_rms;
  8116. // Use the same RMS norm as the final layer norm
  8117. const float norm_rms_eps = hparams.f_norm_rms_eps;
  8118. const int64_t n_seq_tokens = batch.n_seq_tokens;
  8119. GGML_ASSERT(n_seqs != 0);
  8120. GGML_ASSERT(batch.equal_seqs);
  8121. GGML_ASSERT(batch.n_tokens == n_seq_tokens * n_seqs);
  8122. struct ggml_tensor * conv_states_all = kv.k_l[il];
  8123. struct ggml_tensor * ssm_states_all = kv.v_l[il];
  8124. // (ab)using the KV cache to store the states
  8125. struct ggml_tensor * conv = llm_build_copy_mask_state(ctx,
  8126. graph, conv_states_all, state_copy, state_mask,
  8127. hparams.n_embd_k_s(), kv.size, kv_head, n_kv, n_seqs);
  8128. conv = ggml_reshape_3d(ctx, conv, d_conv - 1, d_inner, n_seqs);
  8129. struct ggml_tensor * ssm = llm_build_copy_mask_state(ctx,
  8130. graph, ssm_states_all, state_copy, state_mask,
  8131. hparams.n_embd_v_s(), kv.size, kv_head, n_kv, n_seqs);
  8132. ssm = ggml_reshape_3d(ctx, ssm, d_state, d_inner, n_seqs);
  8133. // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
  8134. cur = ggml_reshape_3d(ctx, cur, cur->ne[0], n_seq_tokens, n_seqs);
  8135. // {n_embd, 2*d_inner} @ {n_embd, n_seq_tokens, n_seqs} => {2*d_inner, n_seq_tokens, n_seqs}
  8136. struct ggml_tensor * xz = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_in, cur);
  8137. // split the above in two
  8138. // => {d_inner, n_seq_tokens, n_seqs}
  8139. struct ggml_tensor * x = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], 0);
  8140. struct ggml_tensor * z = ggml_view_3d(ctx, xz, d_inner, xz->ne[1], xz->ne[2], xz->nb[1], xz->nb[2], d_inner*ggml_element_size(xz));
  8141. // conv
  8142. {
  8143. // => {d_conv - 1 + n_seq_tokens, d_inner, n_seqs}
  8144. struct ggml_tensor * conv_x = ggml_concat(ctx, conv, ggml_transpose(ctx, x), 0);
  8145. // copy last (d_conv - 1) columns back into the state cache
  8146. struct ggml_tensor * last_conv = ggml_view_3d(ctx, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0]));
  8147. ggml_build_forward_expand(graph,
  8148. ggml_cpy(ctx, last_conv,
  8149. ggml_view_1d(ctx, conv_states_all,
  8150. (d_conv - 1)*(d_inner)*(n_seqs),
  8151. kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
  8152. // 1D convolution
  8153. // The equivalent is to make a self-overlapping view of conv_x
  8154. // over d_conv columns at each stride in the 3rd dimension,
  8155. // then element-wise multiply that with the conv1d weight,
  8156. // then sum the elements of each row,
  8157. // (the last two steps are a dot product over rows (also doable with mul_mat))
  8158. // then permute away the ne[0] dimension,
  8159. // and then you're left with the resulting x tensor.
  8160. // For simultaneous sequences, all sequences need to have the same length.
  8161. x = ggml_ssm_conv(ctx, conv_x, model.layers[il].ssm_conv1d);
  8162. // bias
  8163. x = ggml_add(ctx, x, model.layers[il].ssm_conv1d_b);
  8164. x = ggml_silu(ctx, x);
  8165. }
  8166. // ssm
  8167. {
  8168. // {d_inner, dt_rank + 2*d_state} @ {d_inner, n_seq_tokens, n_seqs} => {dt_rank + 2*d_state, n_seq_tokens, n_seqs}
  8169. struct ggml_tensor * x_db = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_x, x);
  8170. // split
  8171. struct ggml_tensor * dt = ggml_view_3d(ctx, x_db, dt_rank, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], 0);
  8172. struct ggml_tensor * B = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*dt_rank);
  8173. struct ggml_tensor * C = ggml_view_3d(ctx, x_db, d_state, n_seq_tokens, n_seqs, x_db->nb[1], x_db->nb[2], ggml_element_size(x_db)*(dt_rank+d_state));
  8174. // Some Mamba variants (e.g. FalconMamba) apply RMS norm in B, C & Dt layers
  8175. if (ssm_dt_b_c_rms) {
  8176. dt = ggml_rms_norm(ctx, dt, norm_rms_eps);
  8177. B = ggml_rms_norm(ctx, B, norm_rms_eps);
  8178. C = ggml_rms_norm(ctx, C, norm_rms_eps);
  8179. }
  8180. // {dt_rank, d_inner} @ {dt_rank, n_seq_tokens, n_seqs} => {d_inner, n_seq_tokens, n_seqs}
  8181. dt = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_dt, dt);
  8182. dt = ggml_add(ctx, dt, model.layers[il].ssm_dt_b);
  8183. // Custom operator to optimize the parallel associative scan
8184. // as described in Annex D of the Mamba paper.
  8185. // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs}
  8186. struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx, ssm, x, dt, model.layers[il].ssm_a, B, C);
  8187. // store last states
  8188. ggml_build_forward_expand(graph,
  8189. ggml_cpy(ctx,
  8190. ggml_view_1d(ctx, y_ssm, d_state*d_inner*n_seqs, x->nb[3]),
  8191. ggml_view_1d(ctx, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
  8192. struct ggml_tensor * y = ggml_view_3d(ctx, y_ssm, d_inner, n_seq_tokens, n_seqs, x->nb[1], x->nb[2], 0);
  8193. // TODO: skip computing output earlier for unused tokens
  8194. // {d_inner, n_seq_tokens, n_seqs} * {d_inner} => {d_inner, n_seq_tokens, n_seqs}
  8195. y = ggml_add(ctx, y, ggml_mul(ctx, x, model.layers[il].ssm_d));
  8196. y = ggml_mul(ctx, y, ggml_silu(ctx, ggml_cont(ctx, z)));
  8197. // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs}
  8198. cur = llm_build_lora_mm(lctx, ctx, model.layers[il].ssm_out, y);
  8199. }
  8200. // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
  8201. cur = ggml_reshape_2d(ctx, cur, cur->ne[0], n_seq_tokens * n_seqs);
  8202. cb(cur, "mamba_out", il);
  8203. return cur;
  8204. }
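// llm_build_rwkv6_time_mix: RWKV v6 time-mix block; sx is the token-shift delta (x_prev - cur),
// which is lerped into the five projections (w, k, v, r, g) before the wkv scan (ggml_rwkv_wkv),
// the per-head group norm, and the final gated output projection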
  8205. static struct ggml_tensor * llm_build_rwkv6_time_mix(
  8206. struct llama_context & lctx,
  8207. struct ggml_context * ctx,
  8208. const struct llama_layer * layer,
  8209. struct ggml_tensor * cur,
  8210. struct ggml_tensor * x_prev,
  8211. struct ggml_tensor ** wkv_state) {
  8212. size_t n_embed = cur->ne[0];
  8213. size_t n_seq_tokens = cur->ne[1];
  8214. size_t n_seqs = cur->ne[2];
  8215. size_t head_size = layer->time_mix_first->ne[0];
  8216. size_t head_count = layer->time_mix_first->ne[1];
  8217. size_t n_tokens = n_seqs * n_seq_tokens;
  8218. struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
  8219. sx = ggml_reshape_2d(ctx, sx, n_embed, n_tokens);
  8220. cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
  8221. struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur);
  8222. xxx = ggml_reshape_4d(
  8223. ctx,
  8224. ggml_tanh(
  8225. ctx,
  8226. ggml_mul_mat(ctx, layer->time_mix_w1, xxx)
  8227. ),
  8228. layer->time_mix_w1->ne[1] / 5, 1, 5, n_tokens
  8229. );
  8230. xxx = ggml_cont(ctx, ggml_permute(ctx, xxx, 0, 1, 3, 2));
  8231. xxx = ggml_mul_mat(
  8232. ctx,
  8233. ggml_reshape_4d(
  8234. ctx,
  8235. layer->time_mix_w2,
  8236. layer->time_mix_w2->ne[0], layer->time_mix_w2->ne[1], 1, 5
  8237. ),
  8238. xxx
  8239. );
  8240. struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0);
  8241. struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float));
  8242. struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float));
  8243. struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float));
  8244. struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float));
  8245. struct ggml_tensor * xw = ggml_add(
  8246. ctx,
  8247. ggml_mul(
  8248. ctx,
  8249. ggml_add(ctx, mw, layer->time_mix_lerp_w),
  8250. sx
  8251. ),
  8252. cur
  8253. );
  8254. struct ggml_tensor * xk = ggml_add(
  8255. ctx,
  8256. ggml_mul(
  8257. ctx,
  8258. ggml_add(ctx, mk, layer->time_mix_lerp_k),
  8259. sx
  8260. ),
  8261. cur
  8262. );
  8263. struct ggml_tensor * xv = ggml_add(
  8264. ctx,
  8265. ggml_mul(
  8266. ctx,
  8267. ggml_add(ctx, mv, layer->time_mix_lerp_v),
  8268. sx
  8269. ),
  8270. cur
  8271. );
  8272. struct ggml_tensor * xr = ggml_add(
  8273. ctx,
  8274. ggml_mul(
  8275. ctx,
  8276. ggml_add(ctx, mr, layer->time_mix_lerp_r),
  8277. sx
  8278. ),
  8279. cur
  8280. );
  8281. struct ggml_tensor * xg = ggml_add(
  8282. ctx,
  8283. ggml_mul(
  8284. ctx,
  8285. ggml_add(ctx, mg, layer->time_mix_lerp_g),
  8286. sx
  8287. ),
  8288. cur
  8289. );
  8290. struct ggml_tensor * r = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
  8291. struct ggml_tensor * k = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
  8292. struct ggml_tensor * v = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
  8293. struct ggml_tensor * g = ggml_silu(
  8294. ctx,
  8295. llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
  8296. );
  8297. struct ggml_tensor * w = ggml_mul_mat(
  8298. ctx,
  8299. layer->time_mix_decay_w2,
  8300. ggml_tanh(
  8301. ctx,
  8302. ggml_mul_mat(ctx, layer->time_mix_decay_w1, xw)
  8303. )
  8304. );
  8305. w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed));
  8306. w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w)));
  8307. w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
  8308. k = ggml_transpose(ctx, k);
  8309. v = ggml_transpose(ctx, v);
  8310. r = ggml_transpose(ctx, r);
  8311. struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
  8312. cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0);
  8313. *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float));
  8314. // group norm with head_count groups
  8315. cur = ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens);
  8316. cur = ggml_norm(ctx, cur, 64e-5f);
  8317. // Convert back to regular vectors.
  8318. cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens);
  8319. cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
  8320. cur = ggml_mul(ctx, cur, g);
  8321. cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
  8322. return ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs);
  8323. }
  8324. static struct ggml_tensor * llm_build_rwkv6_channel_mix(
  8325. struct llama_context & lctx,
  8326. struct ggml_context * ctx,
  8327. const struct llama_layer * layer,
  8328. struct ggml_tensor * cur,
  8329. struct ggml_tensor * x_prev) {
  8330. struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
  8331. struct ggml_tensor * xk = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_k), cur);
  8332. struct ggml_tensor * xr = ggml_add(ctx, ggml_mul(ctx, sx, layer->channel_mix_lerp_r), cur);
  8333. struct ggml_tensor * r = ggml_sigmoid(ctx, llm_build_lora_mm(lctx, ctx, layer->channel_mix_receptance, xr));
  8334. struct ggml_tensor * k = ggml_sqr(
  8335. ctx,
  8336. ggml_relu(
  8337. ctx,
  8338. llm_build_lora_mm(lctx, ctx, layer->channel_mix_key, xk)
  8339. )
  8340. );
  8341. return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
  8342. }
  8343. struct llm_build_context {
  8344. const llama_model & model;
  8345. llama_context & lctx;
  8346. const llama_hparams & hparams;
  8347. const llama_cparams & cparams;
  8348. const llama_ubatch & batch;
  8349. const llama_kv_cache & kv_self;
  8350. const int64_t n_embd;
  8351. const int64_t n_layer;
  8352. const int64_t n_rot;
  8353. const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
  8354. const int64_t n_head;
  8355. const int64_t n_head_kv;
  8356. const int64_t n_embd_head_k;
  8357. const int64_t n_embd_k_gqa;
  8358. const int64_t n_embd_head_v;
  8359. const int64_t n_embd_v_gqa;
  8360. const int64_t n_expert;
  8361. const int64_t n_expert_used;
  8362. const float freq_base;
  8363. const float freq_scale;
  8364. const float ext_factor;
  8365. const float attn_factor;
  8366. const float beta_fast;
  8367. const float beta_slow;
  8368. const float norm_eps;
  8369. const float norm_rms_eps;
  8370. const int32_t n_tokens;
  8371. const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size)
  8372. const int32_t n_outputs;
  8373. const int32_t n_outputs_enc;
  8374. const int32_t kv_head; // index of where we store new KV data in the cache
  8375. const int32_t n_ctx_orig;
  8376. const bool flash_attn;
  8377. const enum llama_pooling_type pooling_type;
  8378. const enum llama_rope_type rope_type;
  8379. const llm_build_cb & cb;
  8380. std::vector<uint8_t> & buf_compute_meta;
  8381. struct ggml_context * ctx0 = nullptr;
  8382. // TODO: consider making the entire interface noexcept
  8383. llm_build_context(
  8384. llama_context & lctx,
  8385. const llama_ubatch & batch,
  8386. const llm_build_cb & cb,
  8387. bool worst_case) :
  8388. model (lctx.model),
  8389. lctx (lctx),
  8390. hparams (model.hparams),
  8391. cparams (lctx.cparams),
  8392. batch (batch),
  8393. kv_self (lctx.kv_self),
  8394. n_embd (hparams.n_embd),
  8395. n_layer (hparams.n_layer),
  8396. n_rot (hparams.n_rot),
  8397. n_ctx (cparams.n_ctx),
  8398. n_head (hparams.n_head()),
  8399. n_head_kv (hparams.n_head_kv()),
  8400. n_embd_head_k (hparams.n_embd_head_k),
  8401. n_embd_k_gqa (hparams.n_embd_k_gqa()),
  8402. n_embd_head_v (hparams.n_embd_head_v),
  8403. n_embd_v_gqa (hparams.n_embd_v_gqa()),
  8404. n_expert (hparams.n_expert),
  8405. n_expert_used (hparams.n_expert_used),
  8406. freq_base (cparams.rope_freq_base),
  8407. freq_scale (cparams.rope_freq_scale),
  8408. ext_factor (cparams.yarn_ext_factor),
  8409. attn_factor (cparams.yarn_attn_factor),
  8410. beta_fast (cparams.yarn_beta_fast),
  8411. beta_slow (cparams.yarn_beta_slow),
  8412. norm_eps (hparams.f_norm_eps),
  8413. norm_rms_eps (hparams.f_norm_rms_eps),
  8414. n_tokens (batch.n_tokens),
  8415. n_kv (worst_case ? kv_self.size : kv_self.n),
  8416. n_outputs (worst_case ? n_tokens : lctx.n_outputs),
  8417. n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd),
  8418. kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head),
  8419. n_ctx_orig (cparams.n_ctx_orig_yarn),
  8420. flash_attn (cparams.flash_attn),
  8421. pooling_type (cparams.pooling_type),
  8422. rope_type (hparams.rope_type),
  8423. cb (cb),
  8424. buf_compute_meta (lctx.buf_compute_meta) {
  8425. // all initializations should be done in init()
  8426. }
  8427. void init() {
  8428. struct ggml_init_params params = {
  8429. /*.mem_size =*/ buf_compute_meta.size(),
  8430. /*.mem_buffer =*/ buf_compute_meta.data(),
  8431. /*.no_alloc =*/ true,
  8432. };
  8433. ctx0 = ggml_init(params);
  8434. lctx.inp_tokens = nullptr;
  8435. lctx.inp_embd = nullptr;
  8436. lctx.inp_pos = nullptr;
  8437. lctx.inp_out_ids = nullptr;
  8438. lctx.inp_KQ_mask = nullptr;
  8439. lctx.inp_KQ_mask_swa = nullptr;
  8440. lctx.inp_K_shift = nullptr;
  8441. lctx.inp_mean = nullptr;
  8442. lctx.inp_cls = nullptr;
  8443. lctx.inp_s_copy = nullptr;
  8444. lctx.inp_s_mask = nullptr;
  8445. lctx.inp_s_seq = nullptr;
  8446. lctx.inp_pos_bucket = nullptr;
  8447. lctx.inp_embd_enc = nullptr;
  8448. lctx.inp_KQ_mask_cross = nullptr;
  8449. }
  8450. void free() {
  8451. if (ctx0) {
  8452. ggml_free(ctx0);
  8453. ctx0 = nullptr;
  8454. }
  8455. }
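// build_k_shift: builds a graph that applies an in-place RoPE rotation by inp_K_shift to every cached K
// entry of every layer, keeping the cache consistent when the positions of cached entries are shifted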
  8456. struct ggml_cgraph * build_k_shift() {
  8457. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8458. GGML_ASSERT(kv_self.size == n_ctx);
  8459. lctx.inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
  8460. cb(lctx.inp_K_shift, "K_shift", -1);
  8461. ggml_set_input(lctx.inp_K_shift);
  8462. for (int il = 0; il < n_layer; ++il) {
  8463. const int64_t n_head_kv = hparams.n_head_kv(il);
  8464. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  8465. struct ggml_tensor * rope_factors = build_rope_factors(il);
  8466. struct ggml_tensor * tmp =
  8467. // we rotate only the first n_rot dimensions
  8468. ggml_rope_ext_inplace(ctx0,
  8469. ggml_view_3d(ctx0, kv_self.k_l[il],
  8470. n_embd_head_k, n_head_kv, n_ctx,
  8471. ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
  8472. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  8473. 0),
  8474. lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8475. ext_factor, attn_factor, beta_fast, beta_slow);
  8476. cb(tmp, "K_shifted", il);
  8477. ggml_build_forward_expand(gf, tmp);
  8478. }
  8479. return gf;
  8480. }
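// build_defrag: builds a graph that moves KV cache cells according to `ids` (ids[i] is the destination
// of cell i); consecutive cells with consecutive destinations are grouped into a single copy of nm rows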
  8481. struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
  8482. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8483. for (uint32_t i = 0; i < ids.size(); ++i) {
  8484. const uint32_t id = ids[i];
  8485. if (i == id || id == ids.size()) {
  8486. continue;
  8487. }
  8488. uint32_t nm = 1;
  8489. while (i + nm < ids.size() && ids[i + nm] == id + nm) {
  8490. nm++;
  8491. }
  8492. for (int il = 0; il < n_layer; ++il) {
  8493. const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
  8494. const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
  8495. ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
  8496. n_embd_k_gqa, nm,
  8497. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  8498. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
  8499. ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
  8500. n_embd_k_gqa, nm,
  8501. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  8502. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
  8503. ggml_tensor * view_v_src;
  8504. ggml_tensor * view_v_dst;
  8505. if (flash_attn) {
  8506. // NOTE: the V cache is not transposed when using flash attention
  8507. view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
  8508. n_embd_v_gqa, nm,
  8509. ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
  8510. ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
  8511. view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
  8512. n_embd_v_gqa, nm,
  8513. ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
  8514. ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
  8515. } else {
  8516. view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
  8517. nm, n_embd_v_gqa,
  8518. ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
  8519. ggml_row_size(kv_self.v_l[il]->type, i));
  8520. view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
  8521. nm, n_embd_v_gqa,
  8522. ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
  8523. ggml_row_size(kv_self.v_l[il]->type, id));
  8524. }
  8525. ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
  8526. ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
  8527. }
  8528. i += nm - 1;
  8529. }
  8530. //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
  8531. return gf;
  8532. }
  8533. struct ggml_tensor * build_inp_pos() {
  8534. lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  8535. cb(lctx.inp_pos, "inp_pos", -1);
  8536. ggml_set_input(lctx.inp_pos);
  8537. return lctx.inp_pos;
  8538. }
  8539. struct ggml_tensor * build_rope_factors(int il) {
  8540. // choose long/short freq factors based on the context size
  8541. const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
  8542. if (model.layers[il].rope_freqs != nullptr) {
  8543. return model.layers[il].rope_freqs;
  8544. }
  8545. if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
  8546. return model.layers[il].rope_long;
  8547. }
  8548. return model.layers[il].rope_short;
  8549. }
  8550. struct ggml_tensor * build_inp_out_ids() {
  8551. lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
  8552. cb(lctx.inp_out_ids, "inp_out_ids", -1);
  8553. ggml_set_input(lctx.inp_out_ids);
  8554. return lctx.inp_out_ids;
  8555. }
  8556. struct ggml_tensor * build_inp_KQ_mask(bool causal = true) {
  8557. lctx.inp_KQ_mask = causal
  8558. ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
  8559. : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
  8560. cb(lctx.inp_KQ_mask, "KQ_mask", -1);
  8561. ggml_set_input(lctx.inp_KQ_mask);
  8562. return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
  8563. }
  8564. struct ggml_tensor * build_inp_KQ_mask_swa(bool causal = true) {
  8565. GGML_ASSERT(hparams.n_swa > 0);
  8566. lctx.inp_KQ_mask_swa = causal
  8567. ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD))
  8568. : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
  8569. cb(lctx.inp_KQ_mask_swa, "KQ_mask_swa", -1);
  8570. ggml_set_input(lctx.inp_KQ_mask_swa);
  8571. return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask_swa, GGML_TYPE_F16) : lctx.inp_KQ_mask_swa;
  8572. }
  8573. struct ggml_tensor * build_inp_mean() {
  8574. lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
  8575. cb(lctx.inp_mean, "inp_mean", -1);
  8576. ggml_set_input(lctx.inp_mean);
  8577. return lctx.inp_mean;
  8578. }
  8579. struct ggml_tensor * build_inp_cls() {
  8580. lctx.inp_cls = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
  8581. cb(lctx.inp_cls, "inp_cls", -1);
  8582. ggml_set_input(lctx.inp_cls);
  8583. return lctx.inp_cls;
  8584. }
  8585. struct ggml_tensor * build_inp_s_copy() {
  8586. lctx.inp_s_copy = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_kv);
  8587. cb(lctx.inp_s_copy, "inp_s_copy", -1);
  8588. ggml_set_input(lctx.inp_s_copy);
  8589. return lctx.inp_s_copy;
  8590. }
  8591. struct ggml_tensor * build_inp_s_mask() {
  8592. lctx.inp_s_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_kv);
  8593. cb(lctx.inp_s_mask, "inp_s_mask", -1);
  8594. ggml_set_input(lctx.inp_s_mask);
  8595. return lctx.inp_s_mask;
  8596. }
  8597. struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
  8598. // find result_norm tensor for input
  8599. struct ggml_tensor * inp = nullptr;
  8600. for (int i = gf->n_nodes - 1; i >= 0; --i) {
  8601. inp = gf->nodes[i];
  8602. if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
  8603. break;
  8604. } else {
  8605. inp = nullptr;
  8606. }
  8607. }
  8608. GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
  8609. struct ggml_tensor * cur;
  8610. switch (pooling_type) {
  8611. case LLAMA_POOLING_TYPE_MEAN:
  8612. {
  8613. struct ggml_tensor * inp_mean = build_inp_mean();
  8614. cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
  8615. } break;
  8616. case LLAMA_POOLING_TYPE_CLS:
  8617. case LLAMA_POOLING_TYPE_LAST:
  8618. {
  8619. struct ggml_tensor * inp_cls = build_inp_cls();
  8620. cur = ggml_get_rows(ctx0, inp, inp_cls);
  8621. } break;
  8622. case LLAMA_POOLING_TYPE_NONE:
  8623. {
  8624. cur = inp;
  8625. } break;
  8626. default:
  8627. {
  8628. GGML_ABORT("unknown pooling type");
  8629. }
  8630. }
  8631. cb(cur, "result_embd_pooled", -1);
  8632. ggml_build_forward_expand(gf, cur);
  8633. return gf;
  8634. }
  8635. struct ggml_tensor * llm_build_pos_bucket(bool causal) {
  8636. if (causal) {
  8637. lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_kv, n_tokens);
  8638. } else {
  8639. lctx.inp_pos_bucket = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_tokens);
  8640. }
  8641. ggml_set_input(lctx.inp_pos_bucket);
  8642. cb(lctx.inp_pos_bucket, "pos_bucket", -1);
  8643. return lctx.inp_pos_bucket;
  8644. }
  8645. struct ggml_tensor * llm_build_pos_bias(struct ggml_tensor * pos_bucket, struct ggml_tensor * attn_rel_b) {
  8646. struct ggml_tensor * pos_bucket_1d = ggml_view_1d(ctx0, pos_bucket, pos_bucket->ne[0] * pos_bucket->ne[1], 0);
  8647. cb(pos_bucket_1d, "pos_bucket_1d", -1);
  8648. struct ggml_tensor * pos_bias = ggml_get_rows(ctx0, attn_rel_b, pos_bucket_1d);
  8649. cb(pos_bias, "pos_bias", -1);
  8650. pos_bias = ggml_view_3d(ctx0, pos_bias, pos_bias->ne[0], lctx.inp_pos_bucket->ne[0], lctx.inp_pos_bucket->ne[1], ggml_element_size(pos_bias) * pos_bias->ne[0], ggml_element_size(pos_bias) * pos_bias->ne[0] * lctx.inp_pos_bucket->ne[0], 0);
  8651. cb(pos_bias, "pos_bias", -1);
  8652. pos_bias = ggml_permute(ctx0, pos_bias, 2, 0, 1, 3);
  8653. cb(pos_bias, "pos_bias", -1);
  8654. pos_bias = ggml_cont(ctx0, pos_bias);
  8655. cb(pos_bias, "pos_bias", -1);
  8656. return pos_bias;
  8657. }
  8658. struct ggml_tensor * llm_build_inp_embd_enc() {
  8659. const int64_t n_embd = hparams.n_embd;
  8660. lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc);
  8661. ggml_set_input(lctx.inp_embd_enc);
  8662. cb(lctx.inp_embd_enc, "embd_enc", -1);
  8663. return lctx.inp_embd_enc;
  8664. }
  8665. struct ggml_tensor * llm_build_inp_KQ_mask_cross() {
  8666. lctx.inp_KQ_mask_cross = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_outputs_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
  8667. ggml_set_input(lctx.inp_KQ_mask_cross);
  8668. cb(lctx.inp_KQ_mask_cross, "KQ_mask_cross", -1);
  8669. return lctx.inp_KQ_mask_cross;
  8670. }
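// build_llama: standard llama-style decoder graph; per layer: RMSNorm -> self-attention with RoPE
// (optional Q/K/V biases) -> residual add -> RMSNorm -> gated SiLU (SwiGLU-style) FFN or MoE FFN ->
// residual add, followed by a final RMSNorm and the lm_head projection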
  8671. struct ggml_cgraph * build_llama() {
  8672. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8673. // mutable variable, needed during the last layer of the computation to skip unused tokens
  8674. int32_t n_tokens = this->n_tokens;
  8675. const int64_t n_embd_head = hparams.n_embd_head_v;
  8676. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8677. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8678. struct ggml_tensor * cur;
  8679. struct ggml_tensor * inpL;
  8680. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  8681. // inp_pos - contains the positions
  8682. struct ggml_tensor * inp_pos = build_inp_pos();
  8683. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  8684. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  8685. for (int il = 0; il < n_layer; ++il) {
  8686. struct ggml_tensor * inpSA = inpL;
  8687. // norm
  8688. cur = llm_build_norm(ctx0, inpL, hparams,
  8689. model.layers[il].attn_norm, NULL,
  8690. LLM_NORM_RMS, cb, il);
  8691. cb(cur, "attn_norm", il);
  8692. // self-attention
  8693. {
  8694. // rope freq factors for llama3; may return nullptr for llama2 and other models
  8695. struct ggml_tensor * rope_factors = build_rope_factors(il);
  8696. // compute Q and K and RoPE them
  8697. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  8698. cb(Qcur, "Qcur", il);
  8699. if (model.layers[il].bq) {
  8700. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  8701. cb(Qcur, "Qcur", il);
  8702. }
  8703. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  8704. cb(Kcur, "Kcur", il);
  8705. if (model.layers[il].bk) {
  8706. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  8707. cb(Kcur, "Kcur", il);
  8708. }
  8709. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  8710. cb(Vcur, "Vcur", il);
  8711. if (model.layers[il].bv) {
  8712. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  8713. cb(Vcur, "Vcur", il);
  8714. }
  8715. Qcur = ggml_rope_ext(
  8716. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
  8717. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8718. ext_factor, attn_factor, beta_fast, beta_slow
  8719. );
  8720. cb(Qcur, "Qcur", il);
  8721. Kcur = ggml_rope_ext(
  8722. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
  8723. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8724. ext_factor, attn_factor, beta_fast, beta_slow
  8725. );
  8726. cb(Kcur, "Kcur", il);
  8727. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  8728. model.layers[il].wo, model.layers[il].bo,
  8729. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  8730. }
  8731. if (il == n_layer - 1) {
  8732. // skip computing output for unused tokens
  8733. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  8734. n_tokens = n_outputs;
  8735. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8736. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8737. }
  8738. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8739. cb(ffn_inp, "ffn_inp", il);
  8740. // feed-forward network
  8741. if (model.layers[il].ffn_gate_inp == nullptr) {
  8742. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  8743. model.layers[il].ffn_norm, NULL,
  8744. LLM_NORM_RMS, cb, il);
  8745. cb(cur, "ffn_norm", il);
  8746. cur = llm_build_ffn(ctx0, lctx, cur,
  8747. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  8748. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  8749. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  8750. NULL,
  8751. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  8752. cb(cur, "ffn_out", il);
  8753. } else {
  8754. // MoE branch
  8755. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  8756. model.layers[il].ffn_norm, NULL,
  8757. LLM_NORM_RMS, cb, il);
  8758. cb(cur, "ffn_norm", il);
  8759. cur = llm_build_moe_ffn(ctx0, lctx, cur,
  8760. model.layers[il].ffn_gate_inp,
  8761. model.layers[il].ffn_up_exps,
  8762. model.layers[il].ffn_gate_exps,
  8763. model.layers[il].ffn_down_exps,
  8764. n_expert, n_expert_used,
  8765. LLM_FFN_SILU, true,
  8766. false, 0.0,
  8767. cb, il);
  8768. cb(cur, "ffn_moe_out", il);
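// the 'true, false, 0.0' arguments are understood here as: renormalize the top-k routing weights and apply no extra output scale (see the llm_build_moe_ffn signature for the authoritative parameter names)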
  8769. }
  8770. cur = ggml_add(ctx0, cur, ffn_inp);
  8771. cb(cur, "ffn_out", il);
  8772. cur = lctx.cvec.apply_to(ctx0, cur, il);
  8773. cb(cur, "l_out", il);
  8774. // input for next layer
  8775. inpL = cur;
  8776. }
  8777. cur = inpL;
  8778. cur = llm_build_norm(ctx0, cur, hparams,
  8779. model.output_norm, NULL,
  8780. LLM_NORM_RMS, cb, -1);
  8781. cb(cur, "result_norm", -1);
  8782. // lm_head
  8783. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  8784. cb(cur, "result_output", -1);
  8785. ggml_build_forward_expand(gf, cur);
  8786. return gf;
  8787. }
  8788. struct ggml_cgraph * build_baichuan() {
  8789. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8790. const int64_t n_embd_head = hparams.n_embd_head_v;
  8791. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8792. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8793. struct ggml_tensor * cur;
  8794. struct ggml_tensor * inpL;
  8795. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  8796. // inp_pos - contains the positions
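// only the 7B variant applies RoPE and therefore needs positions; the 13B variant relies on ALiBi for position information, so inp_pos stays nullptr there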
  8797. struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
  8798. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  8799. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  8800. for (int il = 0; il < n_layer; ++il) {
  8801. struct ggml_tensor * inpSA = inpL;
  8802. cur = llm_build_norm(ctx0, inpL, hparams,
  8803. model.layers[il].attn_norm, NULL,
  8804. LLM_NORM_RMS, cb, il);
  8805. cb(cur, "attn_norm", il);
  8806. // self-attention
  8807. {
  8808. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  8809. cb(Qcur, "Qcur", il);
  8810. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  8811. cb(Kcur, "Kcur", il);
  8812. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  8813. cb(Vcur, "Vcur", il);
  8814. switch (model.type) {
  8815. case MODEL_7B:
  8816. Qcur = ggml_rope_ext(
  8817. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  8818. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8819. ext_factor, attn_factor, beta_fast, beta_slow
  8820. );
  8821. Kcur = ggml_rope_ext(
  8822. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  8823. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8824. ext_factor, attn_factor, beta_fast, beta_slow
  8825. );
  8826. break;
  8827. case MODEL_13B:
  8828. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
  8829. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens);
  8830. break;
  8831. default:
  8832. GGML_ABORT("fatal error");
  8833. }
  8834. cb(Qcur, "Qcur", il);
  8835. cb(Kcur, "Kcur", il);
  8836. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  8837. model.layers[il].wo, NULL,
  8838. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  8839. }
  8840. if (il == n_layer - 1) {
  8841. // skip computing output for unused tokens
  8842. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  8843. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8844. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8845. }
  8846. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8847. cb(ffn_inp, "ffn_inp", il);
  8848. // feed-forward network
  8849. {
  8850. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  8851. model.layers[il].ffn_norm, NULL,
  8852. LLM_NORM_RMS, cb, il);
  8853. cb(cur, "ffn_norm", il);
  8854. cur = llm_build_ffn(ctx0, lctx, cur,
  8855. model.layers[il].ffn_up, NULL, NULL,
  8856. model.layers[il].ffn_gate, NULL, NULL,
  8857. model.layers[il].ffn_down, NULL, NULL,
  8858. NULL,
  8859. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  8860. cb(cur, "ffn_out", il);
  8861. }
  8862. cur = ggml_add(ctx0, cur, ffn_inp);
  8863. cur = lctx.cvec.apply_to(ctx0, cur, il);
  8864. cb(cur, "l_out", il);
  8865. // input for next layer
  8866. inpL = cur;
  8867. }
  8868. cur = inpL;
  8869. cur = llm_build_norm(ctx0, cur, hparams,
  8870. model.output_norm, NULL,
  8871. LLM_NORM_RMS, cb, -1);
  8872. cb(cur, "result_norm", -1);
  8873. // lm_head
  8874. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  8875. cb(cur, "result_output", -1);
  8876. ggml_build_forward_expand(gf, cur);
  8877. return gf;
  8878. }
  8879. struct ggml_cgraph * build_xverse() {
  8880. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8881. const int64_t n_embd_head = hparams.n_embd_head_v;
  8882. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8883. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8884. struct ggml_tensor * cur;
  8885. struct ggml_tensor * inpL;
  8886. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  8887. // inp_pos - contains the positions
  8888. struct ggml_tensor * inp_pos = build_inp_pos();
  8889. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  8890. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  8891. for (int il = 0; il < n_layer; ++il) {
  8892. struct ggml_tensor * inpSA = inpL;
  8893. cur = llm_build_norm(ctx0, inpL, hparams,
  8894. model.layers[il].attn_norm, NULL,
  8895. LLM_NORM_RMS, cb, il);
  8896. cb(cur, "attn_norm", il);
  8897. // self-attention
  8898. {
  8899. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  8900. cb(Qcur, "Qcur", il);
  8901. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  8902. cb(Kcur, "Kcur", il);
  8903. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  8904. cb(Vcur, "Vcur", il);
  8905. Qcur = ggml_rope_ext(
  8906. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  8907. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8908. ext_factor, attn_factor, beta_fast, beta_slow
  8909. );
  8910. cb(Qcur, "Qcur", il);
  8911. Kcur = ggml_rope_ext(
  8912. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  8913. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  8914. ext_factor, attn_factor, beta_fast, beta_slow
  8915. );
  8916. cb(Kcur, "Kcur", il);
  8917. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  8918. model.layers[il].wo, NULL,
  8919. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  8920. }
  8921. if (il == n_layer - 1) {
  8922. // skip computing output for unused tokens
  8923. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  8924. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  8925. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  8926. }
  8927. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  8928. cb(ffn_inp, "ffn_inp", il);
  8929. // feed-forward network
  8930. {
  8931. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  8932. model.layers[il].ffn_norm, NULL,
  8933. LLM_NORM_RMS, cb, il);
  8934. cb(cur, "ffn_norm", il);
  8935. cur = llm_build_ffn(ctx0, lctx, cur,
  8936. model.layers[il].ffn_up, NULL, NULL,
  8937. model.layers[il].ffn_gate, NULL, NULL,
  8938. model.layers[il].ffn_down, NULL, NULL,
  8939. NULL,
  8940. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  8941. cb(cur, "ffn_out", il);
  8942. }
  8943. cur = ggml_add(ctx0, cur, ffn_inp);
  8944. cur = lctx.cvec.apply_to(ctx0, cur, il);
  8945. cb(cur, "l_out", il);
  8946. // input for next layer
  8947. inpL = cur;
  8948. }
  8949. cur = inpL;
  8950. cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1);
  8951. cb(cur, "result_norm", -1);
  8952. // lm_head
  8953. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  8954. cb(cur, "result_output", -1);
  8955. ggml_build_forward_expand(gf, cur);
  8956. return gf;
  8957. }
  8958. struct ggml_cgraph * build_falcon() {
  8959. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  8960. const int64_t n_embd_head = hparams.n_embd_head_v;
  8961. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  8962. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  8963. GGML_ASSERT(n_embd_head == hparams.n_rot);
  8964. struct ggml_tensor * cur;
  8965. struct ggml_tensor * inpL;
  8966. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  8967. // inp_pos - contains the positions
  8968. struct ggml_tensor * inp_pos = build_inp_pos();
  8969. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  8970. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  8971. for (int il = 0; il < n_layer; ++il) {
  8972. struct ggml_tensor * attn_norm;
  8973. attn_norm = llm_build_norm(ctx0, inpL, hparams,
  8974. model.layers[il].attn_norm,
  8975. model.layers[il].attn_norm_b,
  8976. LLM_NORM, cb, il);
  8977. cb(attn_norm, "attn_norm", il);
  8978. // self-attention
  8979. {
  8980. if (model.layers[il].attn_norm_2) {
  8981. // Falcon-40B
  8982. cur = llm_build_norm(ctx0, inpL, hparams,
  8983. model.layers[il].attn_norm_2,
  8984. model.layers[il].attn_norm_2_b,
  8985. LLM_NORM, cb, il);
  8986. cb(cur, "attn_norm_2", il);
  8987. } else {
  8988. cur = attn_norm;
  8989. }
  8990. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  8991. cb(cur, "wqkv", il);
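// split the fused QKV projection: Q occupies the first n_embd rows, K and V the next n_embd_gqa rows each; the view offsets are in bytes, hence the sizeof(float) factors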
  8992. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  8993. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  8994. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  8995. cb(Qcur, "Qcur", il);
  8996. cb(Kcur, "Kcur", il);
  8997. cb(Vcur, "Vcur", il);
  8998. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  8999. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9000. // NeoX-style RoPE (previously selected with mode = 2)
  9001. Qcur = ggml_rope_ext(
  9002. ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  9003. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  9004. );
  9005. cb(Qcur, "Qcur", il);
  9006. Kcur = ggml_rope_ext(
  9007. ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  9008. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  9009. );
  9010. cb(Kcur, "Kcur", il);
  9011. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  9012. model.layers[il].wo, NULL,
  9013. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  9014. }
  9015. if (il == n_layer - 1) {
  9016. // skip computing output for unused tokens
  9017. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9018. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9019. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9020. attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids);
  9021. }
  9022. struct ggml_tensor * ffn_inp = cur;
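// Falcon runs attention and FFN in parallel: the FFN below reads attn_norm (not the attention result), and both branch outputs are added to the layer input further down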
  9023. // feed forward
  9024. {
  9025. cur = llm_build_ffn(ctx0, lctx, attn_norm, // !! use the attn norm, not the result
  9026. model.layers[il].ffn_up, NULL, NULL,
  9027. NULL, NULL, NULL,
  9028. model.layers[il].ffn_down, NULL, NULL,
  9029. NULL,
  9030. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  9031. cb(cur, "ffn_out", il);
  9032. }
  9033. cur = ggml_add(ctx0, cur, ffn_inp);
  9034. cur = ggml_add(ctx0, cur, inpL);
  9035. cur = lctx.cvec.apply_to(ctx0, cur, il);
  9036. cb(cur, "l_out", il);
  9037. // input for next layer
  9038. inpL = cur;
  9039. }
  9040. cur = inpL;
  9041. // norm
  9042. cur = llm_build_norm(ctx0, cur, hparams,
  9043. model.output_norm,
  9044. model.output_norm_b,
  9045. LLM_NORM, cb, -1);
  9046. cb(cur, "result_norm", -1);
  9047. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  9048. cb(cur, "result_output", -1);
  9049. ggml_build_forward_expand(gf, cur);
  9050. return gf;
  9051. }
  9052. struct ggml_cgraph * build_grok() {
  9053. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  9054. // mutable variable, needed during the last layer of the computation to skip unused tokens
  9055. int32_t n_tokens = this->n_tokens;
  9056. const int64_t n_embd_head = hparams.n_embd_head_v;
  9057. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9058. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9059. struct ggml_tensor * cur;
  9060. struct ggml_tensor * inpL;
  9061. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  9062. // multiply by embedding_multiplier_scale of 78.38367176906169
  9063. inpL = ggml_scale(ctx0, inpL, 78.38367176906169f);
  9064. // inp_pos - contains the positions
  9065. struct ggml_tensor * inp_pos = build_inp_pos();
  9066. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  9067. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  9068. for (int il = 0; il < n_layer; ++il) {
  9069. struct ggml_tensor * inpSA = inpL;
  9070. // norm
  9071. cur = llm_build_norm(ctx0, inpL, hparams,
  9072. model.layers[il].attn_norm, NULL,
  9073. LLM_NORM_RMS, cb, il);
  9074. cb(cur, "attn_norm", il);
  9075. // self-attention
  9076. {
  9077. // compute Q and K and RoPE them
  9078. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  9079. cb(Qcur, "Qcur", il);
  9080. if (model.layers[il].bq) {
  9081. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9082. cb(Qcur, "Qcur", il);
  9083. }
  9084. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  9085. cb(Kcur, "Kcur", il);
  9086. if (model.layers[il].bk) {
  9087. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9088. cb(Kcur, "Kcur", il);
  9089. }
  9090. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  9091. cb(Vcur, "Vcur", il);
  9092. if (model.layers[il].bv) {
  9093. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9094. cb(Vcur, "Vcur", il);
  9095. }
  9096. Qcur = ggml_rope_ext(
  9097. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  9098. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9099. ext_factor, attn_factor, beta_fast, beta_slow
  9100. );
  9101. cb(Qcur, "Qcur", il);
  9102. Kcur = ggml_rope_ext(
  9103. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  9104. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9105. ext_factor, attn_factor, beta_fast, beta_slow
  9106. );
  9107. cb(Kcur, "Kcur", il);
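// note: kq_scale is 1.0f here rather than the usual 1/sqrt(n_embd_head); Grok's attention scaling is presumably handled inside the shared attention helper for this arch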
  9108. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  9109. model.layers[il].wo, model.layers[il].bo,
  9110. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  9111. }
  9112. if (il == n_layer - 1) {
  9113. // skip computing output for unused tokens
  9114. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9115. n_tokens = n_outputs;
  9116. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9117. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9118. }
  9119. // Grok
  9120. // if attn_out_norm is present then apply it before adding the input
  9121. if (model.layers[il].attn_out_norm) {
  9122. cur = llm_build_norm(ctx0, cur, hparams,
  9123. model.layers[il].attn_out_norm, NULL,
  9124. LLM_NORM_RMS, cb, il);
  9125. cb(cur, "attn_out_norm", il);
  9126. }
  9127. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9128. cb(ffn_inp, "ffn_inp", il);
  9129. // feed-forward network
  9130. // MoE branch
  9131. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  9132. model.layers[il].ffn_norm, NULL,
  9133. LLM_NORM_RMS, cb, il);
  9134. cb(cur, "ffn_norm", il);
  9135. cur = llm_build_moe_ffn(ctx0, lctx, cur,
  9136. model.layers[il].ffn_gate_inp,
  9137. model.layers[il].ffn_up_exps,
  9138. model.layers[il].ffn_gate_exps,
  9139. model.layers[il].ffn_down_exps,
  9140. n_expert, n_expert_used,
  9141. LLM_FFN_GELU, true,
  9142. false, 0.0,
  9143. cb, il);
  9144. cb(cur, "ffn_moe_out", il);
  9145. // Grok
  9146. // if layer_out_norm is present then apply it before adding the input
  9147. // Idea: maybe ffn_out_norm is a better name
  9148. if (model.layers[il].layer_out_norm) {
  9149. cur = llm_build_norm(ctx0, cur, hparams,
  9150. model.layers[il].layer_out_norm, NULL,
  9151. LLM_NORM_RMS, cb, il);
  9152. cb(cur, "layer_out_norm", il);
  9153. }
  9154. cur = ggml_add(ctx0, cur, ffn_inp);
  9155. cb(cur, "ffn_out", il);
  9156. cur = lctx.cvec.apply_to(ctx0, cur, il);
  9157. cb(cur, "l_out", il);
  9158. // input for next layer
  9159. inpL = cur;
  9160. }
  9161. cur = inpL;
  9162. cur = llm_build_norm(ctx0, cur, hparams,
  9163. model.output_norm, NULL,
  9164. LLM_NORM_RMS, cb, -1);
  9165. cb(cur, "result_norm", -1);
  9166. // lm_head
  9167. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  9168. // Grok
  9169. // multiply logits by output_multiplier_scale of 0.5773502691896257
  9170. cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
  9171. cb(cur, "result_output", -1);
  9172. ggml_build_forward_expand(gf, cur);
  9173. return gf;
  9174. }
  9175. struct ggml_cgraph * build_dbrx() {
  9176. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  9177. // mutable variable, needed during the last layer of the computation to skip unused tokens
  9178. int32_t n_tokens = this->n_tokens;
  9179. const int64_t n_embd_head = hparams.n_embd_head_v;
  9180. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  9181. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9182. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9183. struct ggml_tensor * cur;
  9184. struct ggml_tensor * inpL;
  9185. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  9186. // inp_pos - contains the positions
  9187. struct ggml_tensor * inp_pos = build_inp_pos();
  9188. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  9189. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  9190. for (int il = 0; il < n_layer; ++il) {
  9191. struct ggml_tensor * inpSA = inpL;
  9192. // norm
  9193. cur = llm_build_norm(ctx0, inpL, hparams,
  9194. model.layers[il].attn_norm, NULL,
  9195. LLM_NORM, cb, il);
  9196. cb(cur, "attn_norm", il);
  9197. // self-attention
  9198. {
  9199. struct ggml_tensor * Qcur = nullptr;
  9200. struct ggml_tensor * Kcur = nullptr;
  9201. struct ggml_tensor * Vcur = nullptr;
  9202. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  9203. cb(cur, "wqkv", il);
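// clamp the fused QKV activations to [-f_clamp_kqv, +f_clamp_kqv] (DBRX's clip_qkv setting)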
  9204. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9205. cb(cur, "wqkv_clamped", il);
  9206. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  9207. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  9208. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  9209. cb(Qcur, "Qcur", il);
  9210. cb(Kcur, "Kcur", il);
  9211. cb(Vcur, "Vcur", il);
  9212. Qcur = ggml_rope_ext(
  9213. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  9214. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9215. ext_factor, attn_factor, beta_fast, beta_slow
  9216. );
  9217. cb(Qcur, "Qcur", il);
  9218. Kcur = ggml_rope_ext(
  9219. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  9220. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9221. ext_factor, attn_factor, beta_fast, beta_slow
  9222. );
  9223. cb(Kcur, "Kcur", il);
  9224. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  9225. model.layers[il].wo, NULL,
  9226. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  9227. }
  9228. if (il == n_layer - 1) {
  9229. // skip computing output for unused tokens
  9230. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9231. n_tokens = n_outputs;
  9232. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9233. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9234. }
  9235. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9236. cb(ffn_inp, "ffn_inp", il);
  9237. // feed-forward network
  9238. // MoE branch
  9239. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  9240. model.layers[il].attn_out_norm, NULL,
  9241. LLM_NORM, cb, il);
  9242. cb(cur, "attn_out_norm", il);
  9243. cur = llm_build_moe_ffn(ctx0, lctx, cur,
  9244. model.layers[il].ffn_gate_inp,
  9245. model.layers[il].ffn_up_exps,
  9246. model.layers[il].ffn_gate_exps,
  9247. model.layers[il].ffn_down_exps,
  9248. n_expert, n_expert_used,
  9249. LLM_FFN_SILU, true,
  9250. false, 0.0,
  9251. cb, il);
  9252. cb(cur, "ffn_moe_out", il);
  9253. cur = ggml_add(ctx0, cur, ffn_inp);
  9254. cb(cur, "ffn_out", il);
  9255. cur = lctx.cvec.apply_to(ctx0, cur, il);
  9256. cb(cur, "l_out", il);
  9257. // input for next layer
  9258. inpL = cur;
  9259. }
  9260. cur = inpL;
  9261. cur = llm_build_norm(ctx0, cur, hparams,
  9262. model.output_norm, NULL,
  9263. LLM_NORM, cb, -1);
  9264. cb(cur, "result_norm", -1);
  9265. // lm_head
  9266. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  9267. cb(cur, "result_output", -1);
  9268. ggml_build_forward_expand(gf, cur);
  9269. return gf;
  9270. }
  9271. struct ggml_cgraph * build_starcoder() {
  9272. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  9273. const int64_t n_embd_head = hparams.n_embd_head_v;
  9274. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  9275. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9276. struct ggml_tensor * cur;
  9277. struct ggml_tensor * inpL;
  9278. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  9279. // inp_pos - contains the positions
  9280. struct ggml_tensor * inp_pos = build_inp_pos();
  9281. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  9282. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
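// StarCoder uses learned absolute position embeddings: look them up for the current positions and add them to the token embeddings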
  9283. struct ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  9284. cb(pos, "pos_embd", -1);
  9285. inpL = ggml_add(ctx0, inpL, pos);
  9286. cb(inpL, "inpL", -1);
  9287. for (int il = 0; il < n_layer; ++il) {
  9288. cur = llm_build_norm(ctx0, inpL, hparams,
  9289. model.layers[il].attn_norm,
  9290. model.layers[il].attn_norm_b,
  9291. LLM_NORM, cb, il);
  9292. cb(cur, "attn_norm", il);
  9293. // self-attention
  9294. {
  9295. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  9296. cb(cur, "wqkv", il);
  9297. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  9298. cb(cur, "bqkv", il);
  9299. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  9300. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  9301. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  9302. cb(Qcur, "Qcur", il);
  9303. cb(Kcur, "Kcur", il);
  9304. cb(Vcur, "Vcur", il);
  9305. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9306. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  9307. model.layers[il].wo, model.layers[il].bo,
  9308. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  9309. }
  9310. if (il == n_layer - 1) {
  9311. // skip computing output for unused tokens
  9312. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9313. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9314. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9315. }
  9316. // add the input
  9317. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9318. cb(ffn_inp, "ffn_inp", il);
  9319. // FF
  9320. {
  9321. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  9322. model.layers[il].ffn_norm,
  9323. model.layers[il].ffn_norm_b,
  9324. LLM_NORM, cb, il);
  9325. cb(cur, "ffn_norm", il);
  9326. cur = llm_build_ffn(ctx0, lctx, cur,
  9327. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  9328. NULL, NULL, NULL,
  9329. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  9330. NULL,
  9331. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  9332. cb(cur, "ffn_out", il);
  9333. }
  9334. cur = ggml_add(ctx0, cur, ffn_inp);
  9335. cur = lctx.cvec.apply_to(ctx0, cur, il);
  9336. cb(cur, "l_out", il);
  9337. // input for next layer
  9338. inpL = cur;
  9339. }
  9340. cur = llm_build_norm(ctx0, inpL, hparams,
  9341. model.output_norm,
  9342. model.output_norm_b,
  9343. LLM_NORM, cb, -1);
  9344. cb(cur, "result_norm", -1);
  9345. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  9346. cb(cur, "result_output", -1);
  9347. ggml_build_forward_expand(gf, cur);
  9348. return gf;
  9349. }
  9350. struct ggml_cgraph * build_refact() {
  9351. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  9352. const int64_t n_embd_head = hparams.n_embd_head_v;
  9353. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9354. struct ggml_tensor * cur;
  9355. struct ggml_tensor * inpL;
  9356. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  9357. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  9358. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
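// no inp_pos and no RoPE here: Refact encodes positions with ALiBi, so only the KQ mask input is needed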
  9359. for (int il = 0; il < n_layer; ++il) {
  9360. struct ggml_tensor * inpSA = inpL;
  9361. cur = llm_build_norm(ctx0, inpL, hparams,
  9362. model.layers[il].attn_norm, NULL,
  9363. LLM_NORM_RMS, cb, il);
  9364. cb(cur, "attn_norm", il);
  9365. // self-attention
  9366. {
  9367. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  9368. cb(Qcur, "Qcur", il);
  9369. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  9370. cb(Kcur, "Kcur", il);
  9371. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  9372. cb(Vcur, "Vcur", il);
  9373. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9374. cb(Kcur, "Kcur", il);
  9375. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9376. cb(Qcur, "Qcur", il);
  9377. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  9378. model.layers[il].wo, NULL,
  9379. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  9380. }
  9381. if (il == n_layer - 1) {
  9382. // skip computing output for unused tokens
  9383. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9384. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9385. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9386. }
  9387. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9388. cb(ffn_inp, "ffn_inp", il);
  9389. // feed-forward network
  9390. {
  9391. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  9392. model.layers[il].ffn_norm, NULL,
  9393. LLM_NORM_RMS, cb, il);
  9394. cb(cur, "ffn_norm", il);
  9395. cur = llm_build_ffn(ctx0, lctx, cur,
  9396. model.layers[il].ffn_up, NULL, NULL,
  9397. model.layers[il].ffn_gate, NULL, NULL,
  9398. model.layers[il].ffn_down, NULL, NULL,
  9399. NULL,
  9400. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  9401. cb(cur, "ffn_out", il);
  9402. }
  9403. cur = ggml_add(ctx0, cur, ffn_inp);
  9404. cur = lctx.cvec.apply_to(ctx0, cur, il);
  9405. cb(cur, "l_out", il);
  9406. // input for next layer
  9407. inpL = cur;
  9408. }
  9409. cur = inpL;
  9410. cur = llm_build_norm(ctx0, cur, hparams,
  9411. model.output_norm, NULL,
  9412. LLM_NORM_RMS, cb, -1);
  9413. cb(cur, "result_norm", -1);
  9414. // lm_head
  9415. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  9416. cb(cur, "result_output", -1);
  9417. ggml_build_forward_expand(gf, cur);
  9418. return gf;
  9419. }
  9420. struct ggml_cgraph * build_bert() {
  9421. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  9422. const int64_t n_embd_head = hparams.n_embd_head_v;
  9423. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  9424. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9425. struct ggml_tensor * cur;
  9426. struct ggml_tensor * inpL;
  9427. struct ggml_tensor * inp_pos = nullptr;
  9428. if (model.arch != LLM_ARCH_JINA_BERT_V2) {
  9429. inp_pos = build_inp_pos();
  9430. }
  9431. // construct input embeddings (token, type, position)
  9432. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  9433. // token types are hardcoded to zero ("Sentence A")
  9434. struct ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
  9435. inpL = ggml_add(ctx0, inpL, type_row0);
  9436. if (model.arch == LLM_ARCH_BERT) {
  9437. inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
  9438. }
  9439. cb(inpL, "inp_embd", -1);
  9440. // embed layer norm
  9441. inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
  9442. cb(inpL, "inp_norm", -1);
  9443. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
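// the 'false' argument requests a non-causal mask: encoder-style models attend bidirectionally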
  9444. struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
  9445. // iterate layers
  9446. for (int il = 0; il < n_layer; ++il) {
  9447. struct ggml_tensor * cur = inpL;
  9448. struct ggml_tensor * Qcur;
  9449. struct ggml_tensor * Kcur;
  9450. struct ggml_tensor * Vcur;
  9451. // self-attention
  9452. if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
  9453. Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur), model.layers[il].bq);
  9454. cb(Qcur, "Qcur", il);
  9455. if (model.layers[il].attn_q_norm) {
  9456. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  9457. model.layers[il].attn_q_norm,
  9458. model.layers[il].attn_q_norm_b,
  9459. LLM_NORM, cb, il);
  9460. }
  9461. Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur), model.layers[il].bk);
  9462. cb(Kcur, "Kcur", il);
  9463. if (model.layers[il].attn_k_norm) {
  9464. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  9465. model.layers[il].attn_k_norm,
  9466. model.layers[il].attn_k_norm_b,
  9467. LLM_NORM, cb, il);
  9468. }
  9469. Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur), model.layers[il].bv);
  9470. cb(Vcur, "Vcur", il);
  9471. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9472. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9473. } else {
  9474. // compute Q and K and RoPE them
  9475. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  9476. cb(cur, "wqkv", il);
  9477. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  9478. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  9479. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  9480. cb(Qcur, "Qcur", il);
  9481. cb(Kcur, "Kcur", il);
  9482. cb(Vcur, "Vcur", il);
  9483. Qcur = ggml_rope_ext(
  9484. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  9485. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9486. ext_factor, attn_factor, beta_fast, beta_slow
  9487. );
  9488. cb(Qcur, "Qcur", il);
  9489. Kcur = ggml_rope_ext(
  9490. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  9491. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9492. ext_factor, attn_factor, beta_fast, beta_slow
  9493. );
  9494. cb(Kcur, "Kcur", il);
  9495. }
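// the encoder path has no KV cache, so attention is built directly in the graph (K^T*Q, masked softmax with optional ALiBi bias, then multiply by V) instead of going through llm_build_kv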
  9496. struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  9497. struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  9498. struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  9499. cb(kq, "kq", il);
  9500. kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
  9501. cb(kq, "kq_soft_max_ext", il);
  9502. struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
  9503. cb(v, "v", il);
  9504. struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
  9505. cb(kqv, "kqv", il);
  9506. struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  9507. cb(kqv_merged, "kqv_merged", il);
  9508. cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  9509. cb(cur, "kqv_merged_cont", il);
  9510. ggml_build_forward_expand(gf, cur);
  9511. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
  9512. if (model.layers[il].bo) {
  9513. cb(cur, "kqv_wo", il);
  9514. }
  9515. if (model.layers[il].bo) {
  9516. cur = ggml_add(ctx0, cur, model.layers[il].bo);
  9517. }
  9518. cb(cur, "kqv_out", il);
  9519. if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
  9520. // skip computing output for unused tokens
  9521. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9522. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9523. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9524. }
  9525. // re-add the layer input
  9526. cur = ggml_add(ctx0, cur, inpL);
  9527. // attention layer norm
  9528. cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il);
  9529. if (model.layers[il].attn_norm_2 != nullptr) {
  9530. cur = ggml_add(ctx0, cur, inpL); // re-add the layer input
  9531. cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il);
  9532. }
  9533. struct ggml_tensor * ffn_inp = cur;
  9534. cb(ffn_inp, "ffn_inp", il);
  9535. // feed-forward network
  9536. if (model.arch == LLM_ARCH_BERT) {
  9537. cur = llm_build_ffn(ctx0, lctx, cur,
  9538. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  9539. NULL, NULL, NULL,
  9540. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  9541. NULL,
  9542. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  9543. } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
  9544. cur = llm_build_ffn(ctx0, lctx, cur,
  9545. model.layers[il].ffn_up, NULL, NULL,
  9546. model.layers[il].ffn_gate, NULL, NULL,
  9547. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  9548. NULL,
  9549. LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
  9550. } else {
  9551. cur = llm_build_ffn(ctx0, lctx, cur,
  9552. model.layers[il].ffn_up, NULL, NULL,
  9553. model.layers[il].ffn_gate, NULL, NULL,
  9554. model.layers[il].ffn_down, NULL, NULL,
  9555. NULL,
  9556. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  9557. }
  9558. cb(cur, "ffn_out", il);
9559. // residual connection: the attention output (ffn_inp) bypasses the FFN ("intermediate") layer
  9560. cur = ggml_add(ctx0, cur, ffn_inp);
  9561. // output layer norm
  9562. cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].layer_out_norm, model.layers[il].layer_out_norm_b, LLM_NORM, cb, il);
  9563. // input for next layer
  9564. inpL = cur;
  9565. }
  9566. // final output
  9567. cur = inpL;
  9568. cb(cur, "result_embd", -1);
  9569. ggml_build_forward_expand(gf, cur);
  9570. return gf;
  9571. }
  9572. struct ggml_cgraph * build_bloom() {
  9573. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  9574. const int64_t n_embd_head = hparams.n_embd_head_v;
  9575. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  9576. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9577. struct ggml_tensor * cur;
  9578. struct ggml_tensor * inpL;
  9579. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  9580. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  9581. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
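// BLOOM normalizes the input embeddings with a LayerNorm before the first block; positions are handled by ALiBi, so no inp_pos is built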
  9582. inpL = llm_build_norm(ctx0, inpL, hparams,
  9583. model.tok_norm,
  9584. model.tok_norm_b,
  9585. LLM_NORM, cb, -1);
  9586. cb(inpL, "inp_norm", -1);
  9587. for (int il = 0; il < n_layer; ++il) {
  9588. cur = llm_build_norm(ctx0, inpL, hparams,
  9589. model.layers[il].attn_norm,
  9590. model.layers[il].attn_norm_b,
  9591. LLM_NORM, cb, il);
  9592. cb(cur, "attn_norm", il);
  9593. // self-attention
  9594. {
  9595. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  9596. cb(cur, "wqkv", il);
  9597. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  9598. cb(cur, "bqkv", il);
  9599. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  9600. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  9601. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  9602. cb(Qcur, "Qcur", il);
  9603. cb(Kcur, "Kcur", il);
  9604. cb(Vcur, "Vcur", il);
  9605. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9606. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  9607. model.layers[il].wo, model.layers[il].bo,
  9608. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  9609. }
  9610. if (il == n_layer - 1) {
  9611. // skip computing output for unused tokens
  9612. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9613. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9614. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9615. }
  9616. // Add the input
  9617. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9618. cb(ffn_inp, "ffn_inp", il);
  9619. // FF
  9620. {
  9621. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  9622. model.layers[il].ffn_norm,
  9623. model.layers[il].ffn_norm_b,
  9624. LLM_NORM, cb, il);
  9625. cb(cur, "ffn_norm", il);
  9626. cur = llm_build_ffn(ctx0, lctx, cur,
  9627. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  9628. NULL, NULL, NULL,
  9629. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  9630. NULL,
  9631. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  9632. cb(cur, "ffn_out", il);
  9633. }
  9634. cur = ggml_add(ctx0, cur, ffn_inp);
  9635. cur = lctx.cvec.apply_to(ctx0, cur, il);
  9636. cb(cur, "l_out", il);
  9637. // input for next layer
  9638. inpL = cur;
  9639. }
  9640. cur = llm_build_norm(ctx0, inpL, hparams,
  9641. model.output_norm,
  9642. model.output_norm_b,
  9643. LLM_NORM, cb, -1);
  9644. cb(cur, "result_norm", -1);
  9645. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  9646. cb(cur, "result_output", -1);
  9647. ggml_build_forward_expand(gf, cur);
  9648. return gf;
  9649. }
  9650. struct ggml_cgraph * build_mpt() {
  9651. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  9652. const int64_t n_embd_head = hparams.n_embd_head_v;
  9653. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  9654. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9655. struct ggml_tensor * cur;
  9656. struct ggml_tensor * pos;
  9657. struct ggml_tensor * inpL;
  9658. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  9659. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  9660. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
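// most MPT checkpoints rely on ALiBi only, but some variants ship learned position embeddings; add them when present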
  9661. if (model.pos_embd) {
  9662. // inp_pos - contains the positions
  9663. struct ggml_tensor * inp_pos = build_inp_pos();
  9664. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  9665. cb(pos, "pos_embd", -1);
  9666. inpL = ggml_add(ctx0, inpL, pos);
  9667. cb(inpL, "inpL", -1);
  9668. }
  9669. for (int il = 0; il < n_layer; ++il) {
  9670. struct ggml_tensor * attn_norm;
  9671. attn_norm = llm_build_norm(ctx0, inpL, hparams,
  9672. model.layers[il].attn_norm,
  9673. model.layers[il].attn_norm_b,
  9674. LLM_NORM, cb, il);
  9675. cb(attn_norm, "attn_norm", il);
  9676. // self-attention
  9677. {
  9678. cur = attn_norm;
  9679. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  9680. cb(cur, "wqkv", il);
9681. if (model.layers[il].bqkv) {
  9682. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  9683. cb(cur, "bqkv", il);
  9684. }
  9685. if (hparams.f_clamp_kqv > 0.0f) {
  9686. cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  9687. cb(cur, "wqkv_clamped", il);
  9688. }
  9689. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  9690. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  9691. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  9692. cb(Qcur, "Qcur", il);
  9693. cb(Kcur, "Kcur", il);
  9694. cb(Vcur, "Vcur", il);
  9695. // Q/K Layernorm
  9696. if (model.layers[il].attn_q_norm) {
  9697. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  9698. model.layers[il].attn_q_norm,
  9699. model.layers[il].attn_q_norm_b,
  9700. LLM_NORM, cb, il);
  9701. cb(Qcur, "Qcur", il);
  9702. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  9703. model.layers[il].attn_k_norm,
  9704. model.layers[il].attn_k_norm_b,
  9705. LLM_NORM, cb, il);
  9706. cb(Kcur, "Kcur", il);
  9707. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9708. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9709. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  9710. model.layers[il].wo, model.layers[il].bo,
  9711. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  9712. } else {
  9713. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9714. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  9715. model.layers[il].wo, model.layers[il].bo,
  9716. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  9717. }
  9718. }
  9719. if (il == n_layer - 1) {
  9720. // skip computing output for unused tokens
  9721. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9722. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9723. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9724. }
  9725. // Add the input
  9726. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9727. cb(ffn_inp, "ffn_inp", il);
  9728. // feed forward
  9729. {
  9730. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  9731. model.layers[il].ffn_norm,
  9732. model.layers[il].ffn_norm_b,
  9733. LLM_NORM, cb, il);
  9734. cb(cur, "ffn_norm", il);
  9735. cur = llm_build_ffn(ctx0, lctx, cur,
  9736. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  9737. NULL, NULL, NULL,
  9738. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  9739. model.layers[il].ffn_act,
  9740. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  9741. cb(cur, "ffn_out", il);
  9742. }
  9743. cur = ggml_add(ctx0, cur, ffn_inp);
  9744. cur = lctx.cvec.apply_to(ctx0, cur, il);
  9745. cb(cur, "l_out", il);
  9746. // input for next layer
  9747. inpL = cur;
  9748. }
  9749. cur = inpL;
  9750. cur = llm_build_norm(ctx0, cur, hparams,
  9751. model.output_norm,
  9752. model.output_norm_b,
  9753. LLM_NORM, cb, -1);
  9754. cb(cur, "result_norm", -1);
  9755. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  9756. cb(cur, "result_output", -1);
  9757. ggml_build_forward_expand(gf, cur);
  9758. return gf;
  9759. }
  9760. struct ggml_cgraph * build_stablelm() {
  9761. struct ggml_cgraph * gf = ggml_new_graph(ctx0);
  9762. const int64_t n_embd_head = hparams.n_embd_head_v;
  9763. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9764. struct ggml_tensor * cur;
  9765. struct ggml_tensor * inpL;
  9766. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  9767. // inp_pos - contains the positions
  9768. struct ggml_tensor * inp_pos = build_inp_pos();
  9769. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  9770. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  9771. for (int il = 0; il < n_layer; ++il) {
  9772. // norm
  9773. cur = llm_build_norm(ctx0, inpL, hparams,
  9774. model.layers[il].attn_norm,
  9775. model.layers[il].attn_norm_b,
  9776. LLM_NORM, cb, il);
  9777. cb(cur, "attn_norm", il);
  9778. struct ggml_tensor * inpSA = cur;
  9779. // self-attention
  9780. {
  9781. // compute Q and K and RoPE them
  9782. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  9783. cb(Qcur, "Qcur", il);
  9784. if (model.layers[il].bq) {
  9785. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9786. cb(Qcur, "Qcur", il);
  9787. }
  9788. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  9789. cb(Kcur, "Kcur", il);
  9790. if (model.layers[il].bk) {
  9791. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9792. cb(Kcur, "Kcur", il);
  9793. }
  9794. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  9795. cb(Vcur, "Vcur", il);
  9796. if (model.layers[il].bv) {
  9797. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9798. cb(Vcur, "Vcur", il);
  9799. }
  9800. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9801. cb(Qcur, "Qcur", il);
  9802. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  9803. cb(Kcur, "Kcur", il);
  9804. if (model.layers[il].attn_q_norm) {
  9805. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  9806. model.layers[il].attn_q_norm,
  9807. NULL,
  9808. LLM_NORM, cb, il);
  9809. cb(Qcur, "Qcur", il);
  9810. }
  9811. if (model.layers[il].attn_k_norm) {
  9812. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  9813. model.layers[il].attn_k_norm,
  9814. NULL,
  9815. LLM_NORM, cb, il);
  9816. cb(Kcur, "Kcur", il);
  9817. }
  9818. Qcur = ggml_rope_ext(
  9819. ctx0, Qcur, inp_pos, nullptr,
  9820. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9821. ext_factor, attn_factor, beta_fast, beta_slow
  9822. );
  9823. cb(Qcur, "Qcur", il);
  9824. Kcur = ggml_rope_ext(
  9825. ctx0, Kcur, inp_pos, nullptr,
  9826. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  9827. ext_factor, attn_factor, beta_fast, beta_slow
  9828. );
  9829. cb(Kcur, "Kcur", il);
  9830. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  9831. model.layers[il].wo, NULL,
  9832. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  9833. }
  9834. if (il == n_layer - 1) {
  9835. // skip computing output for unused tokens
  9836. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9837. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9838. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  9839. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9840. }
  9841. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  9842. cb(ffn_inp, "ffn_inp", il);
  9843. // feed-forward network
  9844. {
  9845. if (model.layers[il].ffn_norm) {
  9846. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  9847. model.layers[il].ffn_norm,
  9848. model.layers[il].ffn_norm_b,
  9849. LLM_NORM, cb, il);
  9850. cb(cur, "ffn_norm", il);
  9851. } else {
9852. // parallel residual: with no ffn_norm the FFN reuses the post-attn_norm activations (inpSA), running in parallel with attention
9853. cur = inpSA;
  9854. }
  9855. cur = llm_build_ffn(ctx0, lctx, cur,
  9856. model.layers[il].ffn_up, NULL, NULL,
  9857. model.layers[il].ffn_gate, NULL, NULL,
  9858. model.layers[il].ffn_down, NULL, NULL,
  9859. NULL,
  9860. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  9861. cb(cur, "ffn_out", il);
  9862. }
  9863. cur = ggml_add(ctx0, cur, ffn_inp);
  9864. cur = lctx.cvec.apply_to(ctx0, cur, il);
  9865. cb(cur, "l_out", il);
  9866. // input for next layer
  9867. inpL = cur;
  9868. }
  9869. cur = inpL;
  9870. cur = llm_build_norm(ctx0, cur, hparams,
  9871. model.output_norm,
  9872. model.output_norm_b,
  9873. LLM_NORM, cb, -1);
  9874. cb(cur, "result_norm", -1);
  9875. // lm_head
  9876. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  9877. cb(cur, "result_output", -1);
  9878. ggml_build_forward_expand(gf, cur);
  9879. return gf;
  9880. }
  9881. struct ggml_cgraph * build_qwen() {
  9882. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  9883. const int64_t n_embd_head = hparams.n_embd_head_v;
  9884. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9885. struct ggml_tensor * cur;
  9886. struct ggml_tensor * inpL;
  9887. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  9888. // inp_pos - contains the positions
  9889. struct ggml_tensor * inp_pos = build_inp_pos();
  9890. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  9891. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  9892. for (int il = 0; il < n_layer; ++il) {
  9893. struct ggml_tensor * inpSA = inpL;
  9894. cur = llm_build_norm(ctx0, inpL, hparams,
  9895. model.layers[il].attn_norm, NULL,
  9896. LLM_NORM_RMS, cb, il);
  9897. cb(cur, "attn_norm", il);
  9898. // self-attention
  9899. {
  9900. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  9901. cb(cur, "wqkv", il);
  9902. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  9903. cb(cur, "bqkv", il);
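// Qwen's fused QKV has no GQA: Q, K and V are each n_embd wide, at byte offsets 0, n_embd and 2*n_embd times sizeof(float)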
  9904. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  9905. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  9906. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
  9907. cb(Qcur, "Qcur", il);
  9908. cb(Kcur, "Kcur", il);
  9909. cb(Vcur, "Vcur", il);
  9910. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  9911. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
9912. // NeoX-style RoPE (previously selected with mode = 2)
  9913. Qcur = ggml_rope_ext(
  9914. ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  9915. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  9916. );
  9917. cb(Qcur, "Qcur", il);
  9918. Kcur = ggml_rope_ext(
  9919. ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  9920. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  9921. );
  9922. cb(Kcur, "Kcur", il);
  9923. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  9924. model.layers[il].wo, NULL,
  9925. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  9926. }
  9927. if (il == n_layer - 1) {
  9928. // skip computing output for unused tokens
  9929. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  9930. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  9931. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  9932. }
  9933. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  9934. cb(ffn_inp, "ffn_inp", il);
9935. // feed-forward network
  9936. {
  9937. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  9938. model.layers[il].ffn_norm, NULL,
  9939. LLM_NORM_RMS, cb, il);
  9940. cb(cur, "ffn_norm", il);
  9941. cur = llm_build_ffn(ctx0, lctx, cur,
  9942. model.layers[il].ffn_up, NULL, NULL,
  9943. model.layers[il].ffn_gate, NULL, NULL,
  9944. model.layers[il].ffn_down, NULL, NULL,
  9945. NULL,
  9946. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  9947. cb(cur, "ffn_out", il);
  9948. }
  9949. cur = ggml_add(ctx0, cur, ffn_inp);
  9950. cur = lctx.cvec.apply_to(ctx0, cur, il);
  9951. cb(cur, "l_out", il);
  9952. // input for next layer
  9953. inpL = cur;
  9954. }
  9955. cur = inpL;
  9956. cur = llm_build_norm(ctx0, cur, hparams,
  9957. model.output_norm, NULL,
  9958. LLM_NORM_RMS, cb, -1);
  9959. cb(cur, "result_norm", -1);
  9960. // lm_head
  9961. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  9962. cb(cur, "result_output", -1);
  9963. ggml_build_forward_expand(gf, cur);
  9964. return gf;
  9965. }
  9966. struct ggml_cgraph * build_qwen2() {
  9967. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  9968. const int64_t n_embd_head = hparams.n_embd_head_v;
  9969. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  9970. GGML_ASSERT(n_embd_head == hparams.n_rot);
  9971. struct ggml_tensor * cur;
  9972. struct ggml_tensor * inpL;
  9973. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  9974. // inp_pos - contains the positions
  9975. struct ggml_tensor * inp_pos = build_inp_pos();
  9976. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  9977. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  9978. for (int il = 0; il < n_layer; ++il) {
  9979. struct ggml_tensor * inpSA = inpL;
  9980. // norm
  9981. cur = llm_build_norm(ctx0, inpL, hparams,
  9982. model.layers[il].attn_norm, NULL,
  9983. LLM_NORM_RMS, cb, il);
  9984. cb(cur, "attn_norm", il);
  9985. // self-attention
  9986. {
  9987. // compute Q and K and RoPE them
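// Qwen2 always ships Q/K/V biases, so they are added without null checks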
  9988. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  9989. cb(Qcur, "Qcur", il);
  9990. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  9991. cb(Qcur, "Qcur", il);
  9992. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  9993. cb(Kcur, "Kcur", il);
  9994. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  9995. cb(Kcur, "Kcur", il);
  9996. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  9997. cb(Vcur, "Vcur", il);
  9998. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  9999. cb(Vcur, "Vcur", il);
  10000. Qcur = ggml_rope_ext(
  10001. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  10002. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10003. ext_factor, attn_factor, beta_fast, beta_slow
  10004. );
  10005. cb(Qcur, "Qcur", il);
  10006. Kcur = ggml_rope_ext(
  10007. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  10008. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10009. ext_factor, attn_factor, beta_fast, beta_slow
  10010. );
  10011. cb(Kcur, "Kcur", il);
  10012. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10013. model.layers[il].wo, model.layers[il].bo,
  10014. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  10015. }
  10016. if (il == n_layer - 1) {
  10017. // skip computing output for unused tokens
  10018. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  10019. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10020. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10021. }
  10022. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10023. cb(ffn_inp, "ffn_inp", il);
  10024. // feed-forward network
  10025. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  10026. model.layers[il].ffn_norm, NULL,
  10027. LLM_NORM_RMS, cb, il);
  10028. cb(cur, "ffn_norm", il);
  10029. cur = llm_build_ffn(ctx0, lctx, cur,
  10030. model.layers[il].ffn_up, NULL, NULL,
  10031. model.layers[il].ffn_gate, NULL, NULL,
  10032. model.layers[il].ffn_down, NULL, NULL,
  10033. NULL,
  10034. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  10035. cb(cur, "ffn_out", il);
  10036. cur = ggml_add(ctx0, cur, ffn_inp);
  10037. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10038. cb(cur, "l_out", il);
  10039. // input for next layer
  10040. inpL = cur;
  10041. }
  10042. cur = inpL;
  10043. cur = llm_build_norm(ctx0, cur, hparams,
  10044. model.output_norm, NULL,
  10045. LLM_NORM_RMS, cb, -1);
  10046. cb(cur, "result_norm", -1);
  10047. // lm_head
  10048. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10049. cb(cur, "result_output", -1);
  10050. ggml_build_forward_expand(gf, cur);
  10051. return gf;
  10052. }
  10053. struct ggml_cgraph * build_qwen2moe() {
  10054. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  10055. // mutable variable, needed during the last layer of the computation to skip unused tokens
  10056. int32_t n_tokens = this->n_tokens;
  10057. const int64_t n_embd_head = hparams.n_embd_head_v;
  10058. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10059. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10060. struct ggml_tensor * cur;
  10061. struct ggml_tensor * inpL;
  10062. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  10063. // inp_pos - contains the positions
  10064. struct ggml_tensor * inp_pos = build_inp_pos();
  10065. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  10066. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  10067. for (int il = 0; il < n_layer; ++il) {
  10068. struct ggml_tensor * inpSA = inpL;
  10069. // norm
  10070. cur = llm_build_norm(ctx0, inpL, hparams,
  10071. model.layers[il].attn_norm, NULL,
  10072. LLM_NORM_RMS, cb, il);
  10073. cb(cur, "attn_norm", il);
10074. // self-attention
  10075. {
  10076. // compute Q and K and RoPE them
  10077. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  10078. cb(Qcur, "Qcur", il);
  10079. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10080. cb(Qcur, "Qcur", il);
  10081. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  10082. cb(Kcur, "Kcur", il);
  10083. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10084. cb(Kcur, "Kcur", il);
  10085. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  10086. cb(Vcur, "Vcur", il);
  10087. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10088. cb(Vcur, "Vcur", il);
  10089. Qcur = ggml_rope_ext(
  10090. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  10091. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10092. ext_factor, attn_factor, beta_fast, beta_slow
  10093. );
  10094. cb(Qcur, "Qcur", il);
  10095. Kcur = ggml_rope_ext(
  10096. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  10097. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10098. ext_factor, attn_factor, beta_fast, beta_slow
  10099. );
  10100. cb(Kcur, "Kcur", il);
  10101. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10102. model.layers[il].wo, model.layers[il].bo,
  10103. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  10104. }
  10105. if (il == n_layer - 1) {
  10106. // skip computing output for unused tokens
  10107. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  10108. n_tokens = n_outputs;
  10109. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10110. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10111. }
  10112. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10113. cb(ffn_inp, "ffn_inp", il);
  10114. // MoE branch
  10115. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  10116. model.layers[il].ffn_norm, NULL,
  10117. LLM_NORM_RMS, cb, il);
  10118. cb(cur, "ffn_norm", il);
  10119. ggml_tensor * moe_out =
  10120. llm_build_moe_ffn(ctx0, lctx, cur,
  10121. model.layers[il].ffn_gate_inp,
  10122. model.layers[il].ffn_up_exps,
  10123. model.layers[il].ffn_gate_exps,
  10124. model.layers[il].ffn_down_exps,
  10125. n_expert, n_expert_used,
  10126. LLM_FFN_SILU, false,
  10127. false, 0.0,
  10128. cb, il);
10129. cb(moe_out, "ffn_moe_out", il);
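// the router (ffn_gate_inp) selects the top n_expert_used of n_expert experts per token;
// llm_build_moe_ffn combines their outputs, and the shared-expert branch below is added on top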
  10130. // FFN shared expert
  10131. {
  10132. ggml_tensor * cur_gate_inp = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
  10133. cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
10134. // sigmoid via silu: silu(x)/x == x*sigmoid(x)/x == sigmoid(x)
  10135. ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
  10136. cb(cur_gate, "ffn_shexp_gate", il);
  10137. ggml_tensor * cur_ffn = llm_build_ffn(ctx0, lctx, cur,
  10138. model.layers[il].ffn_up_shexp, NULL, NULL,
  10139. model.layers[il].ffn_gate_shexp, NULL, NULL,
  10140. model.layers[il].ffn_down_shexp, NULL, NULL,
  10141. NULL,
  10142. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  10143. cb(cur_ffn, "ffn_shexp", il);
  10144. ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
  10145. cb(ffn_shexp_out, "ffn_shexp_out", il);
  10146. moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
  10147. cb(moe_out, "ffn_out", il);
  10148. cur = moe_out;
  10149. }
  10150. cur = ggml_add(ctx0, cur, ffn_inp);
  10151. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10152. cb(cur, "l_out", il);
  10153. // input for next layer
  10154. inpL = cur;
  10155. }
  10156. cur = inpL;
  10157. cur = llm_build_norm(ctx0, cur, hparams,
  10158. model.output_norm, NULL,
  10159. LLM_NORM_RMS, cb, -1);
  10160. cb(cur, "result_norm", -1);
  10161. // lm_head
  10162. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10163. cb(cur, "result_output", -1);
  10164. ggml_build_forward_expand(gf, cur);
  10165. return gf;
  10166. }
  10167. struct ggml_cgraph * build_phi2() {
  10168. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  10169. const int64_t n_embd_head = hparams.n_embd_head_v;
  10170. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10171. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10172. struct ggml_tensor * cur;
  10173. struct ggml_tensor * attn_norm_output;
  10174. struct ggml_tensor * ffn_output;
  10175. struct ggml_tensor * inpL;
  10176. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  10177. // inp_pos - contains the positions
  10178. struct ggml_tensor * inp_pos = build_inp_pos();
  10179. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  10180. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  10181. for (int il = 0; il < n_layer; ++il) {
  10182. attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
  10183. model.layers[il].attn_norm,
  10184. model.layers[il].attn_norm_b,
  10185. LLM_NORM, cb, il);
  10186. cb(attn_norm_output, "attn_norm", il);
  10187. // self-attention
  10188. {
  10189. struct ggml_tensor * Qcur = nullptr;
  10190. struct ggml_tensor * Kcur = nullptr;
  10191. struct ggml_tensor * Vcur = nullptr;
  10192. if (model.layers[il].wqkv) {
  10193. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
  10194. cb(cur, "wqkv", il);
  10195. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  10196. cb(cur, "bqkv", il);
  10197. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  10198. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  10199. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  10200. } else {
  10201. Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  10202. Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  10203. Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  10204. }
  10205. cb(Qcur, "Qcur", il);
  10206. cb(Kcur, "Kcur", il);
  10207. cb(Vcur, "Vcur", il);
  10208. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10209. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10210. Qcur = ggml_rope_ext(
  10211. ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  10212. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  10213. );
  10214. cb(Qcur, "Qcur", il);
  10215. // with phi2, we scale the Q to avoid precision issues
  10216. // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
  10217. Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
  10218. cb(Qcur, "Qcur", il);
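// scaling Q by 1/sqrt(n_embd_head) here is why llm_build_kv below is called with kq_scale = 1.0f:
// softmax(Q*K^T/sqrt(d)) is unchanged, but the intermediate Q*K^T values stay smaller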
  10219. Kcur = ggml_rope_ext(
  10220. ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
  10221. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  10222. );
  10223. cb(Kcur, "Kcur", il);
  10224. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10225. model.layers[il].wo, model.layers[il].bo,
  10226. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  10227. }
  10228. if (il == n_layer - 1) {
  10229. // skip computing output for unused tokens
  10230. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  10231. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10232. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  10233. attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
  10234. }
  10235. // FF
  10236. {
  10237. ffn_output = llm_build_ffn(ctx0, lctx, attn_norm_output,
  10238. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10239. NULL, NULL, NULL,
  10240. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10241. NULL,
  10242. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  10243. cb(ffn_output, "ffn_out", il);
  10244. }
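// phi2 uses a parallel residual: the attention output, the FFN output (both computed from the
// same attn_norm_output) and the layer input are summed together below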
  10245. cur = ggml_add(ctx0, cur, ffn_output);
  10246. cur = ggml_add(ctx0, cur, inpL);
  10247. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10248. cb(cur, "l_out", il);
  10249. // input for next layer
  10250. inpL = cur;
  10251. }
  10252. cur = llm_build_norm(ctx0, inpL, hparams,
  10253. model.output_norm,
  10254. model.output_norm_b,
  10255. LLM_NORM, cb, -1);
  10256. cb(cur, "result_norm", -1);
  10257. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10258. cb(cur, "result_output_no_bias", -1);
  10259. cur = ggml_add(ctx0, cur, model.output_b);
  10260. cb(cur, "result_output", -1);
  10261. ggml_build_forward_expand(gf, cur);
  10262. return gf;
  10263. }
  10264. struct ggml_cgraph * build_phi3() {
  10265. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  10266. const int64_t n_embd_head = hparams.n_embd_head_v;
  10267. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10268. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10269. struct ggml_tensor * cur;
  10270. struct ggml_tensor * inpL;
  10271. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  10272. // inp_pos - contains the positions
  10273. struct ggml_tensor * inp_pos = build_inp_pos();
  10274. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  10275. struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa();
  10276. for (int il = 0; il < n_layer; ++il) {
  10277. auto residual = inpL;
  10278. // self-attention
  10279. {
  10280. // rope freq factors for 128k context
  10281. struct ggml_tensor * rope_factors = build_rope_factors(il);
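// build_rope_factors() returns the per-frequency scaling tensor (long/short factors, selected
// from the current context size) that is passed to ggml_rope_ext below as the freq-factors input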
  10282. struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
  10283. model.layers[il].attn_norm,
  10284. NULL,
  10285. LLM_NORM_RMS, cb, il);
  10286. cb(attn_norm_output, "attn_norm", il);
  10287. struct ggml_tensor * Qcur = nullptr;
  10288. struct ggml_tensor * Kcur = nullptr;
  10289. struct ggml_tensor * Vcur = nullptr;
  10290. if (model.layers[il].wqkv) {
  10291. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, attn_norm_output);
  10292. cb(cur, "wqkv", il);
  10293. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
  10294. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
  10295. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
  10296. }
  10297. else {
  10298. Qcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
  10299. Kcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
  10300. Vcur = ggml_add(ctx0, llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
  10301. }
  10302. cb(Qcur, "Qcur", il);
  10303. cb(Kcur, "Kcur", il);
  10304. cb(Vcur, "Vcur", il);
  10305. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10306. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  10307. Qcur = ggml_rope_ext(
  10308. ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
  10309. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  10310. );
  10311. cb(Qcur, "Qcur", il);
  10312. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
  10313. cb(Qcur, "Qcur", il);
  10314. Kcur = ggml_rope_ext(
  10315. ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig,
  10316. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  10317. );
  10318. cb(Kcur, "Kcur", il);
  10319. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10320. model.layers[il].wo, model.layers[il].bo,
  10321. Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  10322. }
  10323. if (il == n_layer - 1) {
  10324. // skip computing output for unused tokens
  10325. struct ggml_tensor* inp_out_ids = build_inp_out_ids();
  10326. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10327. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  10328. }
  10329. cur = ggml_add(ctx0, cur, residual);
  10330. residual = cur;
  10331. cur = llm_build_norm(ctx0, cur, hparams,
  10332. model.layers[il].ffn_norm, NULL,
  10333. LLM_NORM_RMS, cb, il);
  10334. cb(cur, "ffn_norm", il);
  10335. // FF
  10336. // special-case: the up and gate tensors are merged into a single tensor
10337. // TODO: support this in llm_build_ffn
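// LLM_FFN_SWIGLU splits the single ffn_up projection in half and multiplies the halves
// (roughly: x0 = view(cur, 0..n/2), x1 = view(cur, n/2..n), cur = silu(x0) * x1),
// so no separate gate tensor is required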
  10338. {
  10339. cur = llm_build_ffn(ctx0, lctx, cur,
  10340. model.layers[il].ffn_up, NULL, NULL,
  10341. NULL, NULL, NULL,
  10342. model.layers[il].ffn_down, NULL, NULL,
  10343. NULL,
  10344. LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
  10345. cb(cur, "ffn_out", il);
  10346. }
  10347. cur = ggml_add(ctx0, residual, cur);
  10348. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10349. cb(cur, "l_out", il);
  10350. // input for next layer
  10351. inpL = cur;
  10352. }
  10353. cur = llm_build_norm(ctx0, inpL, hparams,
  10354. model.output_norm,
  10355. NULL,
  10356. LLM_NORM_RMS, cb, -1);
  10357. cb(cur, "result_norm", -1);
  10358. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10359. cb(cur, "result_output", -1);
  10360. ggml_build_forward_expand(gf, cur);
  10361. return gf;
  10362. }
  10363. struct ggml_cgraph * build_plamo() {
  10364. struct ggml_cgraph * gf = ggml_new_graph(ctx0);
  10365. const int64_t n_embd_head = hparams.n_embd_head_v;
  10366. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10367. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10368. struct ggml_tensor * cur;
  10369. struct ggml_tensor * inpL;
  10370. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  10371. // inp_pos - contains the positions
  10372. struct ggml_tensor * inp_pos = build_inp_pos();
  10373. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  10374. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  10375. for (int il = 0; il < n_layer; ++il) {
  10376. // norm
  10377. cur = llm_build_norm(ctx0, inpL, hparams,
  10378. model.layers[il].attn_norm, NULL,
  10379. LLM_NORM_RMS, cb, il);
  10380. cb(cur, "attn_norm", il);
  10381. struct ggml_tensor * attention_norm = cur;
  10382. // self-attention
  10383. {
  10384. // compute Q and K and RoPE them
  10385. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  10386. cb(Qcur, "Qcur", il);
  10387. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  10388. cb(Kcur, "Kcur", il);
  10389. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  10390. cb(Vcur, "Vcur", il);
  10391. Qcur = ggml_rope_ext(
  10392. ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
  10393. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  10394. ext_factor, attn_factor, beta_fast, beta_slow);
  10395. cb(Qcur, "Qcur", il);
  10396. Kcur = ggml_rope_ext(
  10397. ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
  10398. n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
  10399. ext_factor, attn_factor, beta_fast, beta_slow);
  10400. cb(Kcur, "Kcur", il);
  10401. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10402. model.layers[il].wo, NULL,
  10403. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  10404. }
  10405. struct ggml_tensor * sa_out = cur;
  10406. cur = attention_norm;
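// PLaMo evaluates attention and the FFN in parallel: the FFN below re-uses the attn_norm output,
// and sa_out, the FFN output and the layer input are summed afterwards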
  10407. if (il == n_layer - 1) {
  10408. // skip computing output for unused tokens
  10409. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  10410. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10411. sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
  10412. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  10413. }
  10414. // feed-forward network
  10415. {
  10416. cur = llm_build_ffn(ctx0, lctx, cur,
  10417. model.layers[il].ffn_up, NULL, NULL,
  10418. model.layers[il].ffn_gate, NULL, NULL,
  10419. model.layers[il].ffn_down, NULL, NULL,
  10420. NULL,
  10421. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  10422. cb(cur, "ffn_out", il);
  10423. }
  10424. cur = ggml_add(ctx0, cur, sa_out);
  10425. cur = ggml_add(ctx0, cur, inpL);
  10426. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10427. cb(cur, "l_out", il);
  10428. // input for next layer
  10429. inpL = cur;
  10430. }
  10431. cur = inpL;
  10432. cur = llm_build_norm(ctx0, cur, hparams,
  10433. model.output_norm, NULL,
  10434. LLM_NORM_RMS, cb, -1);
  10435. cb(cur, "result_norm", -1);
  10436. // lm_head
  10437. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10438. cb(cur, "result_output", -1);
  10439. ggml_build_forward_expand(gf, cur);
  10440. return gf;
  10441. }
  10442. struct ggml_cgraph * build_gpt2() {
  10443. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  10444. const int64_t n_embd_head = hparams.n_embd_head_v;
  10445. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10446. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10447. struct ggml_tensor * cur;
  10448. struct ggml_tensor * pos;
  10449. struct ggml_tensor * inpL;
  10450. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  10451. // inp_pos - contains the positions
  10452. struct ggml_tensor * inp_pos = build_inp_pos();
  10453. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  10454. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  10455. pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
  10456. cb(pos, "pos_embd", -1);
  10457. inpL = ggml_add(ctx0, inpL, pos);
  10458. cb(inpL, "inpL", -1);
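// GPT-2 uses learned absolute position embeddings: the rows of model.pos_embd selected by
// inp_pos are added to the token embeddings, and no RoPE is applied in the layers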
  10459. for (int il = 0; il < n_layer; ++il) {
  10460. cur = llm_build_norm(ctx0, inpL, hparams,
  10461. model.layers[il].attn_norm,
  10462. model.layers[il].attn_norm_b,
  10463. LLM_NORM, cb, il);
  10464. cb(cur, "attn_norm", il);
  10465. // self-attention
  10466. {
  10467. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  10468. cb(cur, "wqkv", il);
  10469. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  10470. cb(cur, "bqkv", il);
  10471. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  10472. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  10473. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  10474. cb(Qcur, "Qcur", il);
  10475. cb(Kcur, "Kcur", il);
  10476. cb(Vcur, "Vcur", il);
  10477. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  10478. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10479. model.layers[il].wo, model.layers[il].bo,
  10480. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  10481. }
  10482. if (il == n_layer - 1) {
  10483. // skip computing output for unused tokens
  10484. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  10485. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10486. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  10487. }
  10488. // add the input
  10489. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  10490. cb(ffn_inp, "ffn_inp", il);
  10491. // FF
  10492. {
  10493. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  10494. model.layers[il].ffn_norm,
  10495. model.layers[il].ffn_norm_b,
  10496. LLM_NORM, cb, il);
  10497. cb(cur, "ffn_norm", il);
  10498. cur = llm_build_ffn(ctx0, lctx, cur,
  10499. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10500. NULL, NULL, NULL,
  10501. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10502. NULL,
  10503. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  10504. cb(cur, "ffn_out", il);
  10505. }
  10506. cur = ggml_add(ctx0, cur, ffn_inp);
  10507. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10508. cb(cur, "l_out", il);
  10509. // input for next layer
  10510. inpL = cur;
  10511. }
  10512. cur = llm_build_norm(ctx0, inpL, hparams,
  10513. model.output_norm,
  10514. model.output_norm_b,
  10515. LLM_NORM, cb, -1);
  10516. cb(cur, "result_norm", -1);
  10517. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10518. cb(cur, "result_output", -1);
  10519. ggml_build_forward_expand(gf, cur);
  10520. return gf;
  10521. }
  10522. struct ggml_cgraph * build_codeshell() {
  10523. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  10524. const int64_t n_embd_head = hparams.n_embd_head_v;
  10525. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  10526. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10527. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10528. struct ggml_tensor * cur;
  10529. struct ggml_tensor * inpL;
  10530. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  10531. // inp_pos - contains the positions
  10532. struct ggml_tensor * inp_pos = build_inp_pos();
  10533. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  10534. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  10535. for (int il = 0; il < n_layer; ++il) {
  10536. cur = llm_build_norm(ctx0, inpL, hparams,
  10537. model.layers[il].attn_norm,
  10538. model.layers[il].attn_norm_b,
  10539. LLM_NORM, cb, il);
  10540. cb(cur, "attn_norm", il);
  10541. // self-attention
  10542. {
  10543. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  10544. cb(cur, "wqkv", il);
  10545. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  10546. cb(cur, "bqkv", il);
  10547. struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  10548. struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  10549. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  10550. cb(tmpq, "tmpq", il);
  10551. cb(tmpk, "tmpk", il);
  10552. cb(Vcur, "Vcur", il);
  10553. struct ggml_tensor * Qcur = ggml_rope_ext(
  10554. ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  10555. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10556. ext_factor, attn_factor, beta_fast, beta_slow
  10557. );
  10558. cb(Qcur, "Qcur", il);
  10559. struct ggml_tensor * Kcur = ggml_rope_ext(
  10560. ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  10561. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10562. ext_factor, attn_factor, beta_fast, beta_slow
  10563. );
  10564. cb(Kcur, "Kcur", il);
  10565. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10566. model.layers[il].wo, model.layers[il].bo,
  10567. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  10568. }
  10569. if (il == n_layer - 1) {
  10570. // skip computing output for unused tokens
  10571. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  10572. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10573. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  10574. }
  10575. // add the input
  10576. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  10577. cb(ffn_inp, "ffn_inp", il);
  10578. // FF
  10579. {
  10580. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  10581. model.layers[il].ffn_norm,
  10582. model.layers[il].ffn_norm_b,
  10583. LLM_NORM, cb, il);
  10584. cb(cur, "ffn_norm", il);
  10585. cur = llm_build_ffn(ctx0, lctx, cur,
  10586. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  10587. NULL, NULL, NULL,
  10588. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  10589. NULL,
  10590. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  10591. cb(cur, "ffn_out", il);
  10592. }
  10593. cur = ggml_add(ctx0, cur, ffn_inp);
  10594. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10595. cb(cur, "l_out", il);
  10596. // input for next layer
  10597. inpL = cur;
  10598. }
  10599. cur = llm_build_norm(ctx0, inpL, hparams,
  10600. model.output_norm,
  10601. model.output_norm_b,
  10602. LLM_NORM, cb, -1);
  10603. cb(cur, "result_norm", -1);
  10604. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10605. cb(cur, "result_output", -1);
  10606. ggml_build_forward_expand(gf, cur);
  10607. return gf;
  10608. }
  10609. struct ggml_cgraph * build_orion() {
  10610. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  10611. const int64_t n_embd_head = hparams.n_embd_head_v;
  10612. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10613. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10614. struct ggml_tensor * cur;
  10615. struct ggml_tensor * inpL;
  10616. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  10617. // inp_pos - contains the positions
  10618. struct ggml_tensor * inp_pos = build_inp_pos();
  10619. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  10620. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  10621. for (int il = 0; il < n_layer; ++il) {
  10622. struct ggml_tensor * inpSA = inpL;
  10623. // norm
  10624. cur = llm_build_norm(ctx0, inpL, hparams,
  10625. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  10626. LLM_NORM, cb, il);
  10627. cb(cur, "attn_norm", il);
  10628. // self-attention
  10629. {
  10630. // compute Q and K and RoPE them
  10631. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  10632. cb(Qcur, "Qcur", il);
  10633. // if (model.layers[il].bq) {
  10634. // Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10635. // cb(Qcur, "Qcur", il);
  10636. // }
  10637. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  10638. cb(Kcur, "Kcur", il);
  10639. // if (model.layers[il].bk) {
  10640. // Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10641. // cb(Kcur, "Kcur", il);
  10642. // }
  10643. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  10644. cb(Vcur, "Vcur", il);
  10645. // if (model.layers[il].bv) {
  10646. // Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10647. // cb(Vcur, "Vcur", il);
  10648. // }
  10649. Qcur = ggml_rope_ext(
  10650. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  10651. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10652. ext_factor, attn_factor, beta_fast, beta_slow
  10653. );
  10654. cb(Qcur, "Qcur", il);
  10655. Kcur = ggml_rope_ext(
  10656. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  10657. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10658. ext_factor, attn_factor, beta_fast, beta_slow
  10659. );
  10660. cb(Kcur, "Kcur", il);
  10661. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10662. model.layers[il].wo, NULL,
  10663. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  10664. }
  10665. if (il == n_layer - 1) {
  10666. // skip computing output for unused tokens
  10667. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  10668. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10669. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10670. }
  10671. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10672. cb(ffn_inp, "ffn_inp", il);
  10673. // feed-forward network
  10674. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  10675. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  10676. LLM_NORM, cb, il);
  10677. cb(cur, "ffn_norm", il);
  10678. cur = llm_build_ffn(ctx0, lctx, cur,
  10679. model.layers[il].ffn_up, NULL, NULL,
  10680. model.layers[il].ffn_gate, NULL, NULL,
  10681. model.layers[il].ffn_down, NULL, NULL,
  10682. NULL,
  10683. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  10684. cb(cur, "ffn_out", il);
  10685. cur = ggml_add(ctx0, cur, ffn_inp);
  10686. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10687. cb(cur, "l_out", il);
  10688. // input for next layer
  10689. inpL = cur;
  10690. }
  10691. cur = inpL;
  10692. cur = llm_build_norm(ctx0, cur, hparams,
  10693. model.output_norm, model.output_norm_b,
  10694. LLM_NORM, cb, -1);
  10695. cb(cur, "result_norm", -1);
  10696. // lm_head
  10697. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10698. cb(cur, "result_output", -1);
  10699. ggml_build_forward_expand(gf, cur);
  10700. return gf;
  10701. }
  10702. struct ggml_cgraph * build_internlm2() {
  10703. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  10704. const int64_t n_embd_head = hparams.n_embd_head_v;
  10705. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10706. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10707. struct ggml_tensor * cur;
  10708. struct ggml_tensor * inpL;
  10709. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  10710. // inp_pos - contains the positions
  10711. struct ggml_tensor * inp_pos = build_inp_pos();
  10712. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  10713. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  10714. for (int il = 0; il < n_layer; ++il) {
  10715. struct ggml_tensor * inpSA = inpL;
  10716. // norm
  10717. cur = llm_build_norm(ctx0, inpL, hparams,
  10718. model.layers[il].attn_norm, NULL,
  10719. LLM_NORM_RMS, cb, il);
  10720. cb(cur, "attn_norm", il);
  10721. // self-attention
  10722. {
  10723. // compute Q and K and RoPE them
  10724. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  10725. cb(Qcur, "Qcur", il);
  10726. if (model.layers[il].bq) {
  10727. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10728. cb(Qcur, "Qcur", il);
  10729. }
  10730. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  10731. cb(Kcur, "Kcur", il);
  10732. if (model.layers[il].bk) {
  10733. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10734. cb(Kcur, "Kcur", il);
  10735. }
  10736. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  10737. cb(Vcur, "Vcur", il);
  10738. if (model.layers[il].bv) {
  10739. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10740. cb(Vcur, "Vcur", il);
  10741. }
  10742. Qcur = ggml_rope_ext(
  10743. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  10744. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10745. ext_factor, attn_factor, beta_fast, beta_slow
  10746. );
  10747. cb(Qcur, "Qcur", il);
  10748. Kcur = ggml_rope_ext(
  10749. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  10750. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10751. ext_factor, attn_factor, beta_fast, beta_slow
  10752. );
  10753. cb(Kcur, "Kcur", il);
  10754. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10755. model.layers[il].wo, model.layers[il].bo,
  10756. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  10757. }
  10758. if (il == n_layer - 1) {
  10759. // skip computing output for unused tokens
  10760. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  10761. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10762. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10763. }
  10764. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10765. cb(ffn_inp, "ffn_inp", il);
  10766. // feed-forward network
  10767. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  10768. model.layers[il].ffn_norm, NULL,
  10769. LLM_NORM_RMS, cb, il);
  10770. cb(cur, "ffn_norm", il);
  10771. cur = llm_build_ffn(ctx0, lctx, cur,
  10772. model.layers[il].ffn_up, NULL, NULL,
  10773. model.layers[il].ffn_gate, NULL, NULL,
  10774. model.layers[il].ffn_down, NULL, NULL,
  10775. NULL,
  10776. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  10777. cb(cur, "ffn_out", il);
  10778. cur = ggml_add(ctx0, cur, ffn_inp);
  10779. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10780. cb(cur, "l_out", il);
  10781. // input for next layer
  10782. inpL = cur;
  10783. }
  10784. cur = inpL;
  10785. cur = llm_build_norm(ctx0, cur, hparams,
  10786. model.output_norm, NULL,
  10787. LLM_NORM_RMS, cb, -1);
  10788. cb(cur, "result_norm", -1);
  10789. // lm_head
  10790. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10791. cb(cur, "result_output", -1);
  10792. ggml_build_forward_expand(gf, cur);
  10793. return gf;
  10794. }
  10795. // ref: https://arxiv.org/abs/2203.03466
  10796. // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
  10797. // based on the original build_llama() function
  10798. struct ggml_cgraph * build_minicpm() {
  10799. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  10800. const int64_t n_embd_head = hparams.n_embd_head_v;
  10801. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  10802. GGML_ASSERT(n_embd_head == hparams.n_rot);
  10803. const int64_t n_embd = hparams.n_embd;
10804. // TODO: if the model varies, these parameters need to be read from the model
  10805. const int64_t n_embd_base = 256;
  10806. const float scale_embd = 12.0f;
  10807. const float scale_depth = 1.4f;
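// per the muP-style recipe referenced above: embeddings are scaled by scale_embd, each residual
// branch by scale_depth/sqrt(n_layer), and the lm_head input by n_embd_base/n_embd
// (see the corresponding ggml_scale calls below)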
  10808. struct ggml_tensor * cur;
  10809. struct ggml_tensor * inpL;
  10810. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  10811. // scale the input embeddings
  10812. inpL = ggml_scale(ctx0, inpL, scale_embd);
  10813. cb(inpL, "inp_scaled", -1);
  10814. // inp_pos - contains the positions
  10815. struct ggml_tensor * inp_pos = build_inp_pos();
  10816. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  10817. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  10818. for (int il = 0; il < n_layer; ++il) {
  10819. struct ggml_tensor * inpSA = inpL;
  10820. // norm
  10821. cur = llm_build_norm(ctx0, inpL, hparams,
  10822. model.layers[il].attn_norm, NULL,
  10823. LLM_NORM_RMS, cb, il);
  10824. cb(cur, "attn_norm", il);
  10825. // self-attention
  10826. {
  10827. // compute Q and K and RoPE them
  10828. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  10829. cb(Qcur, "Qcur", il);
  10830. if (model.layers[il].bq) {
  10831. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  10832. cb(Qcur, "Qcur", il);
  10833. }
  10834. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  10835. cb(Kcur, "Kcur", il);
  10836. if (model.layers[il].bk) {
  10837. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  10838. cb(Kcur, "Kcur", il);
  10839. }
  10840. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  10841. cb(Vcur, "Vcur", il);
  10842. if (model.layers[il].bv) {
  10843. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  10844. cb(Vcur, "Vcur", il);
  10845. }
  10846. Qcur = ggml_rope_ext(
  10847. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  10848. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10849. ext_factor, attn_factor, beta_fast, beta_slow
  10850. );
  10851. cb(Qcur, "Qcur", il);
  10852. Kcur = ggml_rope_ext(
  10853. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  10854. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10855. ext_factor, attn_factor, beta_fast, beta_slow
  10856. );
  10857. cb(Kcur, "Kcur", il);
  10858. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10859. model.layers[il].wo, model.layers[il].bo,
  10860. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  10861. }
  10862. if (il == n_layer - 1) {
  10863. // skip computing output for unused tokens
  10864. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  10865. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10866. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  10867. }
  10868. // scale_res - scale the hidden states for residual connection
  10869. const float scale_res = scale_depth/sqrtf(float(n_layer));
  10870. cur = ggml_scale(ctx0, cur, scale_res);
  10871. cb(cur, "hidden_scaled", -1);
  10872. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  10873. cb(ffn_inp, "ffn_inp", il);
  10874. // feed-forward network
  10875. {
  10876. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  10877. model.layers[il].ffn_norm, NULL,
  10878. LLM_NORM_RMS, cb, il);
  10879. cb(cur, "ffn_norm", il);
  10880. cur = llm_build_ffn(ctx0, lctx, cur,
  10881. model.layers[il].ffn_up, NULL, NULL,
  10882. model.layers[il].ffn_gate, NULL, NULL,
  10883. model.layers[il].ffn_down, NULL, NULL,
  10884. NULL,
  10885. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  10886. cb(cur, "ffn_out", il);
  10887. }
  10888. // scale the hidden states for residual connection
  10889. cur = ggml_scale(ctx0, cur, scale_res);
  10890. cb(cur, "hidden_scaled_ffn", -1);
  10891. cur = ggml_add(ctx0, cur, ffn_inp);
  10892. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10893. cb(cur, "l_out", il);
  10894. // input for next layer
  10895. inpL = cur;
  10896. }
  10897. cur = inpL;
  10898. cur = llm_build_norm(ctx0, cur, hparams,
  10899. model.output_norm, NULL,
  10900. LLM_NORM_RMS, cb, -1);
  10901. cb(cur, "result_norm", -1);
  10902. // lm_head scaling
  10903. const float scale_lmhead = float(n_embd_base)/float(n_embd);
  10904. cur = ggml_scale(ctx0, cur, scale_lmhead);
  10905. cb(cur, "lmhead_scaling", -1);
  10906. // lm_head
  10907. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10908. cb(cur, "result_output", -1);
  10909. ggml_build_forward_expand(gf, cur);
  10910. return gf;
  10911. }
  10912. struct ggml_cgraph * build_gemma() {
  10913. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  10914. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  10915. struct ggml_tensor * cur;
  10916. struct ggml_tensor * inpL;
  10917. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  10918. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  10919. cb(inpL, "inp_scaled", -1);
  10920. // inp_pos - contains the positions
  10921. struct ggml_tensor * inp_pos = build_inp_pos();
  10922. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  10923. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  10924. for (int il = 0; il < n_layer; ++il) {
  10925. // norm
  10926. cur = llm_build_norm(ctx0, inpL, hparams,
  10927. model.layers[il].attn_norm, NULL,
  10928. LLM_NORM_RMS, cb, il);
  10929. cb(cur, "attn_norm", il);
  10930. // self-attention
  10931. {
  10932. // compute Q and K and RoPE them
  10933. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  10934. cb(Qcur, "Qcur", il);
  10935. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  10936. cb(Kcur, "Kcur", il);
  10937. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  10938. cb(Vcur, "Vcur", il);
  10939. Qcur = ggml_rope_ext(
  10940. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
  10941. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10942. ext_factor, attn_factor, beta_fast, beta_slow);
  10943. cb(Qcur, "Qcur", il);
  10944. Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
  10945. cb(Qcur, "Qcur_scaled", il);
  10946. Kcur = ggml_rope_ext(
  10947. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
  10948. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  10949. ext_factor, attn_factor, beta_fast, beta_slow);
  10950. cb(Kcur, "Kcur", il);
  10951. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  10952. model.layers[il].wo, NULL,
  10953. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  10954. }
  10955. if (il == n_layer - 1) {
  10956. // skip computing output for unused tokens
  10957. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  10958. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  10959. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  10960. }
  10961. struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  10962. cb(sa_out, "sa_out", il);
  10963. cur = llm_build_norm(ctx0, sa_out, hparams,
  10964. model.layers[il].ffn_norm, NULL,
  10965. LLM_NORM_RMS, cb, il);
  10966. cb(cur, "ffn_norm", il);
  10967. // feed-forward network
  10968. {
  10969. cur = llm_build_ffn(ctx0, lctx, cur,
  10970. model.layers[il].ffn_up, NULL, NULL,
  10971. model.layers[il].ffn_gate, NULL, NULL,
  10972. model.layers[il].ffn_down, NULL, NULL,
  10973. NULL,
  10974. LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
  10975. cb(cur, "ffn_out", il);
  10976. }
  10977. cur = ggml_add(ctx0, cur, sa_out);
  10978. cur = lctx.cvec.apply_to(ctx0, cur, il);
  10979. cb(cur, "l_out", il);
  10980. // input for next layer
  10981. inpL = cur;
  10982. }
  10983. cur = inpL;
  10984. cur = llm_build_norm(ctx0, cur, hparams,
  10985. model.output_norm, NULL,
  10986. LLM_NORM_RMS, cb, -1);
  10987. cb(cur, "result_norm", -1);
  10988. // lm_head
  10989. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  10990. cb(cur, "result_output", -1);
  10991. ggml_build_forward_expand(gf, cur);
  10992. return gf;
  10993. }
  10994. struct ggml_cgraph * build_gemma2() {
  10995. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  10996. const int64_t n_embd_head_k = hparams.n_embd_head_k;
  10997. struct ggml_tensor * cur;
  10998. struct ggml_tensor * inpL;
  10999. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  11000. inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
  11001. cb(inpL, "inp_scaled", -1);
  11002. // inp_pos - contains the positions
  11003. struct ggml_tensor * inp_pos = build_inp_pos();
  11004. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
11005. // Gemma 2 requires a different mask for the layers that use sliding-window attention (SWA)
  11006. struct ggml_tensor * KQ_mask = build_inp_KQ_mask(true);
  11007. struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(true);
  11008. for (int il = 0; il < n_layer; ++il) {
11009. // even-indexed layers (il % 2 == 0) use SWA; the rest use the full causal mask
  11010. struct ggml_tensor * KQ_mask_l = (il % 2 == 0) ? KQ_mask_swa : KQ_mask;
  11011. // norm
  11012. cur = llm_build_norm(ctx0, inpL, hparams,
  11013. model.layers[il].attn_norm, NULL,
  11014. LLM_NORM_RMS, cb, il);
  11015. cb(cur, "attn_norm", il);
  11016. // self-attention
  11017. {
  11018. // compute Q and K and RoPE them
  11019. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  11020. cb(Qcur, "Qcur", il);
  11021. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  11022. cb(Kcur, "Kcur", il);
  11023. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  11024. cb(Vcur, "Vcur", il);
  11025. Qcur = ggml_rope_ext(
  11026. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
  11027. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11028. ext_factor, attn_factor, beta_fast, beta_slow);
  11029. cb(Qcur, "Qcur", il);
  11030. // ref: https://github.com/google/gemma_pytorch/commit/03e657582d17cb5a8617ebf333c1c16f3694670e
  11031. switch (model.type) {
  11032. case e_model::MODEL_2B:
  11033. case e_model::MODEL_9B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); break;
  11034. case e_model::MODEL_27B: Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); break;
  11035. default: GGML_ABORT("fatal error");
  11036. };
  11037. cb(Qcur, "Qcur_scaled", il);
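// per the gemma_pytorch commit referenced above, the 27B variant scales queries by
// 1/sqrt(n_embd/n_head) (the per-head width derived from the model width) rather than
// 1/sqrt(n_embd_head_k)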
  11038. Kcur = ggml_rope_ext(
  11039. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
  11040. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11041. ext_factor, attn_factor, beta_fast, beta_slow);
  11042. cb(Kcur, "Kcur", il);
  11043. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  11044. model.layers[il].wo, NULL,
  11045. Kcur, Vcur, Qcur, KQ_mask_l, n_tokens, kv_head, n_kv, 1.0f, cb, il);
  11046. }
  11047. cur = llm_build_norm(ctx0, cur, hparams,
  11048. model.layers[il].attn_post_norm, NULL,
  11049. LLM_NORM_RMS, cb, il);
  11050. cb(cur, "attn_post_norm", il);
  11051. if (il == n_layer - 1) {
  11052. // skip computing output for unused tokens
  11053. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  11054. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11055. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  11056. }
  11057. struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
  11058. cb(sa_out, "sa_out", il);
  11059. cur = llm_build_norm(ctx0, sa_out, hparams,
  11060. model.layers[il].ffn_norm, NULL,
  11061. LLM_NORM_RMS, cb, il);
  11062. cb(cur, "ffn_norm", il);
  11063. // feed-forward network
  11064. {
  11065. cur = llm_build_ffn(ctx0, lctx, cur,
  11066. model.layers[il].ffn_up, NULL, NULL,
  11067. model.layers[il].ffn_gate, NULL, NULL,
  11068. model.layers[il].ffn_down, NULL, NULL,
  11069. NULL,
  11070. LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
  11071. cb(cur, "ffn_out", il);
  11072. }
  11073. cur = llm_build_norm(ctx0, cur, hparams,
  11074. model.layers[il].ffn_post_norm, NULL,
  11075. LLM_NORM_RMS, cb, -1);
  11076. cb(cur, "ffn_post_norm", -1);
  11077. cur = ggml_add(ctx0, cur, sa_out);
  11078. cur = lctx.cvec.apply_to(ctx0, cur, il);
  11079. cb(cur, "l_out", il);
  11080. // input for next layer
  11081. inpL = cur;
  11082. }
  11083. cur = inpL;
  11084. cur = llm_build_norm(ctx0, cur, hparams,
  11085. model.output_norm, NULL,
  11086. LLM_NORM_RMS, cb, -1);
  11087. cb(cur, "result_norm", -1);
  11088. // lm_head
  11089. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  11090. // final logit soft-capping
  11091. cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
  11092. cur = ggml_tanh(ctx0, cur);
  11093. cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
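// i.e. logits = softcap * tanh(logits / softcap), which keeps every logit inside
// (-f_final_logit_softcapping, +f_final_logit_softcapping)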
  11094. cb(cur, "result_output", -1);
  11095. ggml_build_forward_expand(gf, cur);
  11096. return gf;
  11097. }
  11098. struct ggml_cgraph * build_starcoder2() {
  11099. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  11100. const int64_t n_embd_head = hparams.n_embd_head_v;
  11101. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11102. GGML_ASSERT(n_embd_head == hparams.n_rot);
  11103. struct ggml_tensor * cur;
  11104. struct ggml_tensor * inpL;
  11105. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  11106. // inp_pos - contains the positions
  11107. struct ggml_tensor * inp_pos = build_inp_pos();
  11108. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  11109. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  11110. for (int il = 0; il < n_layer; ++il) {
  11111. struct ggml_tensor * inpSA = inpL;
  11112. // norm
  11113. cur = llm_build_norm(ctx0, inpL, hparams,
  11114. model.layers[il].attn_norm, model.layers[il].attn_norm_b,
  11115. LLM_NORM, cb, il);
  11116. cb(cur, "attn_norm", il);
  11117. // self-attention
  11118. {
  11119. // compute Q and K and RoPE them
  11120. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  11121. cb(Qcur, "Qcur", il);
  11122. if (model.layers[il].bq) {
  11123. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11124. cb(Qcur, "Qcur", il);
  11125. }
  11126. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  11127. cb(Kcur, "Kcur", il);
  11128. if (model.layers[il].bk) {
  11129. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11130. cb(Kcur, "Kcur", il);
  11131. }
  11132. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  11133. cb(Vcur, "Vcur", il);
  11134. if (model.layers[il].bv) {
  11135. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11136. cb(Vcur, "Vcur", il);
  11137. }
  11138. Qcur = ggml_rope_ext(
  11139. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  11140. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11141. ext_factor, attn_factor, beta_fast, beta_slow
  11142. );
  11143. cb(Qcur, "Qcur", il);
  11144. Kcur = ggml_rope_ext(
  11145. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  11146. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11147. ext_factor, attn_factor, beta_fast, beta_slow
  11148. );
  11149. cb(Kcur, "Kcur", il);
  11150. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  11151. model.layers[il].wo, model.layers[il].bo,
  11152. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  11153. }
  11154. if (il == n_layer - 1) {
  11155. // skip computing output for unused tokens
  11156. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  11157. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11158. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11159. }
  11160. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11161. cb(ffn_inp, "ffn_inp", il);
  11162. // feed-forward network
  11163. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  11164. model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  11165. LLM_NORM, cb, il);
  11166. cb(cur, "ffn_norm", il);
  11167. cur = llm_build_ffn(ctx0, lctx, cur,
  11168. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  11169. NULL, NULL, NULL,
  11170. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  11171. NULL,
  11172. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  11173. cb(cur, "ffn_out", il);
  11174. cur = ggml_add(ctx0, cur, ffn_inp);
  11175. cur = lctx.cvec.apply_to(ctx0, cur, il);
  11176. cb(cur, "l_out", il);
  11177. // input for next layer
  11178. inpL = cur;
  11179. }
  11180. cur = inpL;
  11181. cur = llm_build_norm(ctx0, cur, hparams,
  11182. model.output_norm, model.output_norm_b,
  11183. LLM_NORM, cb, -1);
  11184. cb(cur, "result_norm", -1);
  11185. // lm_head
  11186. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  11187. cb(cur, "result_output", -1);
  11188. ggml_build_forward_expand(gf, cur);
  11189. return gf;
  11190. }
  11191. struct ggml_cgraph * build_mamba() {
  11192. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  11193. struct ggml_tensor * cur;
  11194. struct ggml_tensor * inpL;
  11195. // {n_embd, n_tokens}
  11196. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  11197. struct ggml_tensor * state_copy = build_inp_s_copy();
  11198. struct ggml_tensor * state_mask = build_inp_s_mask();
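// state_copy and state_mask let llm_build_mamba rearrange and clear the recurrent SSM states
// stored in the cache slots when sequences are copied or removed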
  11199. for (int il = 0; il < n_layer; ++il) {
  11200. // norm
  11201. cur = llm_build_norm(ctx0, inpL, hparams,
  11202. model.layers[il].attn_norm, NULL,
  11203. LLM_NORM_RMS, cb, il);
  11204. cb(cur, "attn_norm", il);
  11205. cur = llm_build_mamba(ctx0, lctx, batch, gf, cur,
  11206. state_copy, state_mask,
  11207. kv_head, n_kv, cb, il);
  11208. if (il == n_layer - 1) {
  11209. // skip computing output for unused tokens
  11210. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  11211. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11212. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  11213. }
  11214. // residual
  11215. cur = ggml_add(ctx0, cur, inpL);
  11216. cur = lctx.cvec.apply_to(ctx0, cur, il);
  11217. cb(cur, "l_out", il);
  11218. // input for next layer
  11219. inpL = cur;
  11220. }
  11221. // final rmsnorm
  11222. cur = llm_build_norm(ctx0, inpL, hparams,
  11223. model.output_norm, NULL,
  11224. LLM_NORM_RMS, cb, -1);
  11225. cb(cur, "result_norm", -1);
  11226. // lm_head
  11227. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  11228. cb(cur, "result_output", -1);
  11229. ggml_build_forward_expand(gf, cur);
  11230. return gf;
  11231. }
  11232. struct ggml_cgraph * build_command_r() {
  11233. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  11234. const int64_t n_embd_head = hparams.n_embd_head_v;
  11235. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11236. const float f_logit_scale = hparams.f_logit_scale;
  11237. struct ggml_tensor * cur;
  11238. struct ggml_tensor * inpL;
  11239. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  11240. // inp_pos - contains the positions
  11241. struct ggml_tensor * inp_pos = build_inp_pos();
  11242. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  11243. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  11244. for (int il = 0; il < n_layer; ++il) {
  11245. // norm
  11246. cur = llm_build_norm(ctx0, inpL, hparams,
  11247. model.layers[il].attn_norm, NULL,
  11248. LLM_NORM, cb, il);
  11249. cb(cur, "attn_norm", il);
  11250. struct ggml_tensor * ffn_inp = cur;
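// Command-R runs attention and the FFN in parallel: ffn_inp aliases the attn_norm output,
// and both branch outputs are added to the layer input further down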
  11251. // self-attention
  11252. {
  11253. // compute Q and K and RoPE them
  11254. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  11255. cb(Qcur, "Qcur", il);
  11256. if (model.layers[il].bq) {
  11257. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11258. cb(Qcur, "Qcur", il);
  11259. }
  11260. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  11261. cb(Kcur, "Kcur", il);
  11262. if (model.layers[il].bk) {
  11263. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11264. cb(Kcur, "Kcur", il);
  11265. }
  11266. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  11267. cb(Vcur, "Vcur", il);
  11268. if (model.layers[il].bv) {
  11269. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11270. cb(Vcur, "Vcur", il);
  11271. }
  11272. if (model.layers[il].attn_q_norm) {
  11273. Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
  11274. ggml_element_size(Qcur) * n_embd_head,
  11275. ggml_element_size(Qcur) * n_embd_head * n_head,
  11276. 0);
  11277. cb(Qcur, "Qcur", il);
  11278. Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
  11279. ggml_element_size(Kcur) * n_embd_head,
  11280. ggml_element_size(Kcur) * n_embd_head * n_head_kv,
  11281. 0);
  11282. cb(Kcur, "Kcur", il);
  11283. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  11284. model.layers[il].attn_q_norm,
  11285. NULL,
  11286. LLM_NORM, cb, il);
  11287. cb(Qcur, "Qcur", il);
  11288. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  11289. model.layers[il].attn_k_norm,
  11290. NULL,
  11291. LLM_NORM, cb, il);
  11292. cb(Kcur, "Kcur", il);
  11293. }
  11294. Qcur = ggml_rope_ext(
  11295. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  11296. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11297. ext_factor, attn_factor, beta_fast, beta_slow
  11298. );
  11299. cb(Qcur, "Qcur", il);
  11300. Kcur = ggml_rope_ext(
  11301. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  11302. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11303. ext_factor, attn_factor, beta_fast, beta_slow
  11304. );
  11305. cb(Kcur, "Kcur", il);
  11306. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  11307. model.layers[il].wo, model.layers[il].bo,
  11308. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  11309. }
  11310. if (il == n_layer - 1) {
  11311. // skip computing output for unused tokens
  11312. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  11313. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11314. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  11315. ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
  11316. }
  11317. struct ggml_tensor * attn_out = cur;
  11318. // feed-forward network
  11319. {
  11320. cur = llm_build_ffn(ctx0, lctx, ffn_inp,
  11321. model.layers[il].ffn_up, NULL, NULL,
  11322. model.layers[il].ffn_gate, NULL, NULL,
  11323. model.layers[il].ffn_down, NULL, NULL,
  11324. NULL,
  11325. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  11326. cb(cur, "ffn_out", il);
  11327. }
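// note: this builder uses a parallel residual layout: the attention block and the FFN both read
// the same attn_norm output (ffn_inp is captured right after the norm above), and both results are
// summed with the un-normalized layer input below.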
  11328. // add together residual + FFN + self-attention
  11329. cur = ggml_add(ctx0, cur, inpL);
  11330. cur = ggml_add(ctx0, cur, attn_out);
  11331. cur = lctx.cvec.apply_to(ctx0, cur, il);
  11332. cb(cur, "l_out", il);
  11333. // input for next layer
  11334. inpL = cur;
  11335. }
  11336. cur = inpL;
  11337. cur = llm_build_norm(ctx0, cur, hparams,
  11338. model.output_norm, NULL,
  11339. LLM_NORM, cb, -1);
  11340. cb(cur, "result_norm", -1);
  11341. // lm_head
  11342. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  11343. if (f_logit_scale) {
  11344. cur = ggml_scale(ctx0, cur, f_logit_scale);
  11345. }
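// illustrative note (hypothetical value): with f_logit_scale = 0.0625f every output logit is
// multiplied by 0.0625 before sampling; a value of 0.0f is treated as "no scaling" by the
// truthiness check above.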
  11346. cb(cur, "result_output", -1);
  11347. ggml_build_forward_expand(gf, cur);
  11348. return gf;
  11349. }
  11350. // ref: https://allenai.org/olmo
  11351. // based on the original build_llama() function, changes:
  11352. // * non-parametric layer norm
  11353. // * clamp qkv
  11354. // * removed bias
  11355. // * removed MoE
  11356. struct ggml_cgraph * build_olmo() {
  11357. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  11358. // mutable variable, needed during the last layer of the computation to skip unused tokens
  11359. int32_t n_tokens = this->n_tokens;
  11360. const int64_t n_embd_head = hparams.n_embd_head_v;
  11361. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11362. GGML_ASSERT(n_embd_head == hparams.n_rot);
  11363. struct ggml_tensor * cur;
  11364. struct ggml_tensor * inpL;
  11365. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  11366. // inp_pos - contains the positions
  11367. struct ggml_tensor * inp_pos = build_inp_pos();
  11368. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  11369. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  11370. for (int il = 0; il < n_layer; ++il) {
  11371. struct ggml_tensor * inpSA = inpL;
  11372. // norm
  11373. cur = llm_build_norm(ctx0, inpL, hparams,
  11374. NULL, NULL,
  11375. LLM_NORM, cb, il);
  11376. cb(cur, "attn_norm", il);
  11377. // self-attention
  11378. {
  11379. // compute Q and K and RoPE them
  11380. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  11381. cb(Qcur, "Qcur", il);
  11382. if (hparams.f_clamp_kqv > 0.0f) {
  11383. Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  11384. cb(Qcur, "Qcur", il);
  11385. }
  11386. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  11387. cb(Kcur, "Kcur", il);
  11388. if (hparams.f_clamp_kqv > 0.0f) {
  11389. Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  11390. cb(Kcur, "Kcur", il);
  11391. }
  11392. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  11393. cb(Vcur, "Vcur", il);
  11394. if (hparams.f_clamp_kqv > 0.0f) {
  11395. Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
  11396. cb(Vcur, "Vcur", il);
  11397. }
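// illustrative note (hypothetical value): with f_clamp_kqv = 8.0f each element of Q, K and V is
// saturated to the range [-8, 8]; when the hparam is 0 the clamps above are skipped entirely.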
  11398. Qcur = ggml_rope_ext(
  11399. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  11400. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11401. ext_factor, attn_factor, beta_fast, beta_slow
  11402. );
  11403. cb(Qcur, "Qcur", il);
  11404. Kcur = ggml_rope_ext(
  11405. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  11406. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11407. ext_factor, attn_factor, beta_fast, beta_slow
  11408. );
  11409. cb(Kcur, "Kcur", il);
  11410. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  11411. model.layers[il].wo, nullptr,
  11412. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  11413. }
  11414. if (il == n_layer - 1) {
  11415. // skip computing output for unused tokens
  11416. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  11417. n_tokens = n_outputs;
  11418. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11419. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11420. }
  11421. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11422. cb(ffn_inp, "ffn_inp", il);
  11423. // feed-forward network
  11424. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  11425. NULL, NULL,
  11426. LLM_NORM, cb, il);
  11427. cb(cur, "ffn_norm", il);
  11428. cur = llm_build_ffn(ctx0, lctx, cur,
  11429. model.layers[il].ffn_up, NULL, NULL,
  11430. model.layers[il].ffn_gate, NULL, NULL,
  11431. model.layers[il].ffn_down, NULL, NULL,
  11432. NULL,
  11433. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  11434. cb(cur, "ffn_out", il);
  11435. cur = ggml_add(ctx0, cur, ffn_inp);
  11436. cb(cur, "ffn_out", il);
  11437. cur = lctx.cvec.apply_to(ctx0, cur, il);
  11438. cb(cur, "l_out", il);
  11439. // input for next layer
  11440. inpL = cur;
  11441. }
  11442. cur = inpL;
  11443. cur = llm_build_norm(ctx0, cur, hparams,
  11444. NULL, NULL,
  11445. LLM_NORM, cb, -1);
  11446. cb(cur, "result_norm", -1);
  11447. // lm_head
  11448. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  11449. cb(cur, "result_output", -1);
  11450. ggml_build_forward_expand(gf, cur);
  11451. return gf;
  11452. }
  11453. struct ggml_cgraph * build_openelm() {
  11454. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  11455. const int64_t n_embd_head = hparams.n_embd_head_v;
  11456. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11457. struct ggml_tensor * cur;
  11458. struct ggml_tensor * inpL;
  11459. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  11460. // inp_pos - contains the positions
  11461. struct ggml_tensor * inp_pos = build_inp_pos();
  11462. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  11463. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  11464. for (int il = 0; il < n_layer; ++il) {
  11465. const int64_t n_head = hparams.n_head(il);
  11466. const int64_t n_head_kv = hparams.n_head_kv(il);
  11467. const int64_t n_head_qkv = 2*n_head_kv + n_head;
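// OpenELM varies n_head and n_head_kv per layer, so the fused wqkv projection below yields
// n_head + 2*n_head_kv head-sized rows per token; the Q/K/V views split that tensor at head
// offsets 0, n_head and n_head + n_head_kv respectively.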
  11468. cur = inpL;
  11469. struct ggml_tensor * residual = cur;
  11470. // norm
  11471. cur = llm_build_norm(ctx0, inpL, hparams,
  11472. model.layers[il].attn_norm, NULL,
  11473. LLM_NORM_RMS, cb, il);
  11474. cb(cur, "attn_norm", il);
  11475. // self-attention
  11476. {
  11477. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  11478. cb(cur, "wqkv", il);
  11479. cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens);
  11480. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0));
  11481. cb(Qcur, "Qcur", il);
  11482. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head));
  11483. cb(Kcur, "Kcur", il);
  11484. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv)));
  11485. cb(Vcur, "Vcur", il);
  11486. Qcur = llm_build_norm(ctx0, Qcur, hparams,
  11487. model.layers[il].attn_q_norm, NULL,
  11488. LLM_NORM_RMS, cb, il);
  11489. cb(Qcur, "Qcur", il);
  11490. Kcur = llm_build_norm(ctx0, Kcur, hparams,
  11491. model.layers[il].attn_k_norm, NULL,
  11492. LLM_NORM_RMS, cb, il);
  11493. cb(Kcur, "Kcur", il);
  11494. Qcur = ggml_rope_ext(
  11495. ctx0, Qcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig,
  11496. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  11497. );
  11498. cb(Qcur, "Qcur", il);
  11499. Kcur = ggml_rope_ext(
  11500. ctx0, Kcur, inp_pos, NULL, n_rot, rope_type, n_ctx_orig,
  11501. freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
  11502. );
  11503. cb(Kcur, "Kcur", il);
  11504. Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens);
11505. cb(Vcur, "Vcur", il);

  11506. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  11507. model.layers[il].wo, NULL,
  11508. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  11509. }
  11510. if (il == n_layer - 1) {
  11511. // skip computing output for unused tokens
  11512. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  11513. residual = ggml_get_rows(ctx0, residual, inp_out_ids);
  11514. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11515. }
  11516. struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
  11517. cb(ffn_inp, "ffn_inp", il);
  11518. // feed-forward network
  11519. {
  11520. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  11521. model.layers[il].ffn_norm, NULL,
  11522. LLM_NORM_RMS, cb, il);
  11523. cb(cur, "ffn_norm", il);
  11524. cur = llm_build_ffn(ctx0, lctx, cur,
  11525. model.layers[il].ffn_up, NULL, NULL,
  11526. model.layers[il].ffn_gate, NULL, NULL,
  11527. model.layers[il].ffn_down, NULL, NULL,
  11528. NULL,
  11529. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  11530. cb(cur, "ffn_out", il);
  11531. }
  11532. cur = ggml_add(ctx0, cur, ffn_inp);
  11533. cur = lctx.cvec.apply_to(ctx0, cur, il);
  11534. cb(cur, "l_out", il);
  11535. inpL = cur;
  11536. }
  11537. cur = inpL;
  11538. // norm
  11539. cur = llm_build_norm(ctx0, cur, hparams,
  11540. model.output_norm, NULL,
  11541. LLM_NORM_RMS, cb, -1);
  11542. cb(cur, "result_norm", -1);
  11543. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  11544. cb(cur, "result_output", -1);
  11545. ggml_build_forward_expand(gf, cur);
  11546. return gf;
  11547. }
  11548. struct ggml_cgraph * build_gptneox() {
  11549. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  11550. const int64_t n_embd_head = hparams.n_embd_head_v;
  11551. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  11552. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11553. struct ggml_tensor * cur;
  11554. struct ggml_tensor * inpL;
  11555. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  11556. // inp_pos - contains the positions
  11557. struct ggml_tensor * inp_pos = build_inp_pos();
  11558. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  11559. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  11560. for (int il = 0; il < n_layer; ++il) {
  11561. cur = llm_build_norm(ctx0, inpL, hparams,
  11562. model.layers[il].attn_norm,
  11563. model.layers[il].attn_norm_b,
  11564. LLM_NORM, cb, il);
  11565. cb(cur, "attn_norm", il);
  11566. // self-attention
  11567. {
  11568. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  11569. cb(cur, "wqkv", il);
  11570. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  11571. cb(cur, "bqkv", il);
  11572. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  11573. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  11574. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  11575. cb(Qcur, "Qcur", il);
  11576. cb(Kcur, "Kcur", il);
  11577. cb(Vcur, "Vcur", il);
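// the fused QKV activation is split by byte offset within each row: Q starts at 0, K at
// n_embd floats and V at (n_embd + n_embd_gqa) floats, matching the 2D views above.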
  11578. Qcur = ggml_rope_ext(
  11579. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  11580. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11581. ext_factor, attn_factor, beta_fast, beta_slow
  11582. );
  11583. cb(Qcur, "Qcur", il);
  11584. Kcur = ggml_rope_ext(
  11585. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  11586. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11587. ext_factor, attn_factor, beta_fast, beta_slow
  11588. );
  11589. cb(Kcur, "Kcur", il);
  11590. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  11591. model.layers[il].wo, model.layers[il].bo,
  11592. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  11593. }
  11594. if (il == n_layer - 1) {
  11595. // skip computing output for unused tokens
  11596. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  11597. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11598. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  11599. }
  11600. // ffn
  11601. if (hparams.use_par_res) {
  11602. // attention and ffn are computed in parallel
  11603. // x = x + attn(ln1(x)) + ffn(ln2(x))
  11604. struct ggml_tensor * attn_out = cur;
  11605. cur = llm_build_norm(ctx0, inpL, hparams,
  11606. model.layers[il].ffn_norm,
  11607. model.layers[il].ffn_norm_b,
  11608. LLM_NORM, cb, il);
  11609. cb(cur, "ffn_norm", il);
  11610. cur = llm_build_ffn(ctx0, lctx, cur,
  11611. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  11612. NULL, NULL, NULL,
  11613. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  11614. NULL,
  11615. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  11616. cb(cur, "ffn_out", il);
  11617. cur = ggml_add(ctx0, cur, inpL);
  11618. cb(cur, "ffn_out", il);
  11619. cur = ggml_add(ctx0, cur, attn_out);
  11620. cur = lctx.cvec.apply_to(ctx0, cur, il);
  11621. cb(cur, "l_out", il);
  11622. // input for next layer
  11623. inpL = cur;
  11624. } else {
  11625. // attention and ffn are computed sequentially
  11626. // x = x + attn(ln1(x))
  11627. // x = x + ffn(ln2(x))
  11628. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  11629. cb(ffn_inp, "ffn_inp", il);
  11630. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  11631. model.layers[il].ffn_norm,
  11632. model.layers[il].ffn_norm_b,
  11633. LLM_NORM, cb, il);
  11634. cb(cur, "ffn_norm", il);
  11635. cur = llm_build_ffn(ctx0, lctx, cur,
  11636. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  11637. NULL, NULL, NULL,
  11638. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  11639. NULL,
  11640. LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
  11641. cb(cur, "ffn_out", il);
  11642. cur = ggml_add(ctx0, cur, ffn_inp);
  11643. cur = lctx.cvec.apply_to(ctx0, cur, il);
  11644. cb(cur, "l_out", il);
  11645. // input for next layer
  11646. inpL = cur;
  11647. }
  11648. }
  11649. cur = llm_build_norm(ctx0, inpL, hparams,
  11650. model.output_norm,
  11651. model.output_norm_b,
  11652. LLM_NORM, cb, -1);
  11653. cb(cur, "result_norm", -1);
  11654. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  11655. cb(cur, "result_output", -1);
  11656. ggml_build_forward_expand(gf, cur);
  11657. return gf;
  11658. }
  11659. struct ggml_cgraph * build_arctic() {
  11660. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  11661. // mutable variable, needed during the last layer of the computation to skip unused tokens
  11662. int32_t n_tokens = this->n_tokens;
  11663. const int64_t n_embd_head = hparams.n_embd_head_v;
  11664. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11665. GGML_ASSERT(n_embd_head == hparams.n_rot);
  11666. struct ggml_tensor * cur;
  11667. struct ggml_tensor * inpL;
  11668. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  11669. // inp_pos - contains the positions
  11670. struct ggml_tensor * inp_pos = build_inp_pos();
  11671. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  11672. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  11673. for (int il = 0; il < n_layer; ++il) {
  11674. struct ggml_tensor * inpSA = inpL;
  11675. // norm
  11676. cur = llm_build_norm(ctx0, inpL, hparams,
  11677. model.layers[il].attn_norm, NULL,
  11678. LLM_NORM_RMS, cb, il);
  11679. cb(cur, "attn_norm", il);
  11680. // self-attention
  11681. {
  11682. // compute Q and K and RoPE them
  11683. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  11684. cb(Qcur, "Qcur", il);
  11685. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  11686. cb(Kcur, "Kcur", il);
  11687. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  11688. cb(Vcur, "Vcur", il);
  11689. Qcur = ggml_rope_ext(
  11690. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  11691. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11692. ext_factor, attn_factor, beta_fast, beta_slow
  11693. );
  11694. cb(Qcur, "Qcur", il);
  11695. Kcur = ggml_rope_ext(
  11696. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  11697. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11698. ext_factor, attn_factor, beta_fast, beta_slow
  11699. );
  11700. cb(Kcur, "Kcur", il);
  11701. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  11702. model.layers[il].wo, NULL,
  11703. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  11704. }
  11705. if (il == n_layer - 1) {
  11706. // skip computing output for unused tokens
  11707. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  11708. n_tokens = n_outputs;
  11709. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11710. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11711. }
  11712. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11713. cb(ffn_inp, "ffn_inp", il);
  11714. // feed-forward network
  11715. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  11716. model.layers[il].ffn_norm, NULL,
  11717. LLM_NORM_RMS, cb, il);
  11718. cb(cur, "ffn_norm", il);
  11719. cur = llm_build_ffn(ctx0, lctx, cur,
  11720. model.layers[il].ffn_up, NULL, NULL,
  11721. model.layers[il].ffn_gate, NULL, NULL,
  11722. model.layers[il].ffn_down, NULL, NULL,
  11723. NULL,
  11724. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  11725. cb(cur, "ffn_out", il);
  11726. struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
  11727. cb(ffn_out, "ffn_out", il);
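// Arctic runs two feed-forward branches per layer: the dense FFN above (residual kept in ffn_out)
// and a MoE branch below that re-normalizes the pre-attention stream inpSA with its own
// ffn_norm_exps before the expert mixture; the two results are added together afterwards.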
  11728. // MoE
  11729. cur = llm_build_norm(ctx0, inpSA, hparams,
  11730. model.layers[il].ffn_norm_exps, NULL,
  11731. LLM_NORM_RMS, cb, il);
  11732. cb(cur, "ffn_norm_exps", il);
  11733. cur = llm_build_moe_ffn(ctx0, lctx, cur,
  11734. model.layers[il].ffn_gate_inp,
  11735. model.layers[il].ffn_up_exps,
  11736. model.layers[il].ffn_gate_exps,
  11737. model.layers[il].ffn_down_exps,
  11738. n_expert, n_expert_used,
  11739. LLM_FFN_SILU, true,
  11740. false, 0.0,
  11741. cb, il);
  11742. cb(cur, "ffn_moe_out", il);
  11743. cur = ggml_add(ctx0, cur, ffn_out);
  11744. cb(cur, "ffn_out", il);
  11745. cur = lctx.cvec.apply_to(ctx0, cur, il);
  11746. cb(cur, "l_out", il);
  11747. // input for next layer
  11748. inpL = cur;
  11749. }
  11750. cur = inpL;
  11751. cur = llm_build_norm(ctx0, cur, hparams,
  11752. model.output_norm, NULL,
  11753. LLM_NORM_RMS, cb, -1);
  11754. cb(cur, "result_norm", -1);
  11755. // lm_head
  11756. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  11757. cb(cur, "result_output", -1);
  11758. ggml_build_forward_expand(gf, cur);
  11759. return gf;
  11760. }
  11761. struct ggml_cgraph * build_deepseek2() {
  11762. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  11763. // mutable variable, needed during the last layer of the computation to skip unused tokens
  11764. int32_t n_tokens = this->n_tokens;
  11765. bool is_lite = (hparams.n_layer == 27);
  11766. // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
  11767. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
  11768. const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
  11769. const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
  11770. const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
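// illustrative numbers (hypothetical hparams): with attn_factor = 1.0f, rope_yarn_log_mul = 0.1f
// and freq_scale = 0.25f, mscale = 1 + 0.1*ln(4) ~= 1.139, so kq_scale ~= 1.297/sqrt(n_embd_head_k)
// and attn_factor_scaled ~= 0.878; the scaled attn_factor handed to ggml_rope_ext below compensates
// for the magnitude correction the RoPE op applies internally, so the net YaRN scaling ends up in
// kq_scale (see the linked discussion for the full derivation).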
  11771. const uint32_t n_embd_head_qk_rope = hparams.n_rot;
  11772. const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
  11773. const uint32_t kv_lora_rank = hparams.n_lora_kv;
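// MLA layout: the first n_embd_head_qk_nope dimensions of each query/key head skip RoPE (the
// *_nope views below) and the last n_rot dimensions receive it (the *_pe views); K's no-RoPE part
// and V are reconstructed from a kv_lora_rank-wide compressed latent via wkv_b, while a single
// shared RoPE key (k_pe) comes straight from the wkv_a_mqa projection.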
  11774. struct ggml_tensor * cur;
  11775. struct ggml_tensor * inpL;
  11776. // {n_embd, n_tokens}
  11777. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  11778. // inp_pos - contains the positions
  11779. struct ggml_tensor * inp_pos = build_inp_pos();
  11780. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  11781. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  11782. for (int il = 0; il < n_layer; ++il) {
  11783. struct ggml_tensor * inpSA = inpL;
  11784. // norm
  11785. cur = llm_build_norm(ctx0, inpL, hparams,
  11786. model.layers[il].attn_norm, NULL,
  11787. LLM_NORM_RMS, cb, il);
  11788. cb(cur, "attn_norm", il);
11789. // self-attention
  11790. {
  11791. struct ggml_tensor * q = NULL;
  11792. if (!is_lite) {
  11793. // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
  11794. q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
  11795. cb(q, "q", il);
  11796. q = llm_build_norm(ctx0, q, hparams,
  11797. model.layers[il].attn_q_a_norm, NULL,
  11798. LLM_NORM_RMS, cb, il);
  11799. cb(q, "q", il);
  11800. // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
  11801. q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
  11802. cb(q, "q", il);
  11803. } else {
  11804. q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  11805. cb(q, "q", il);
  11806. }
  11807. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  11808. struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
  11809. ggml_row_size(q->type, hparams.n_embd_head_k),
  11810. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  11811. 0);
  11812. cb(q_nope, "q_nope", il);
  11813. // and {n_head * n_embd_head_qk_rope, n_tokens}
  11814. struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
  11815. ggml_row_size(q->type, hparams.n_embd_head_k),
  11816. ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
  11817. ggml_row_size(q->type, n_embd_head_qk_nope));
  11818. cb(q_pe, "q_pe", il);
  11819. // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
11820. struct ggml_tensor * kv_pe_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
11821. cb(kv_pe_compressed, "kv_pe_compressed", il);
11822. // split into {kv_lora_rank, n_tokens}
11823. struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compressed, kv_lora_rank, n_tokens,
11824. kv_pe_compressed->nb[1],
11825. 0);
11826. cb(kv_compressed, "kv_compressed", il);
11827. // and {n_embd_head_qk_rope, n_tokens}
11828. struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compressed, n_embd_head_qk_rope, 1, n_tokens,
11829. kv_pe_compressed->nb[1],
11830. kv_pe_compressed->nb[1],
11831. ggml_row_size(kv_pe_compressed->type, kv_lora_rank));
  11832. cb(k_pe, "k_pe", il);
  11833. kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
  11834. kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
  11835. model.layers[il].attn_kv_a_norm, NULL,
  11836. LLM_NORM_RMS, cb, il);
  11837. cb(kv_compressed, "kv_compressed", il);
  11838. // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
  11839. struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
  11840. cb(kv, "kv", il);
  11841. // split into {n_head * n_embd_head_qk_nope, n_tokens}
  11842. struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
  11843. ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
  11844. ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  11845. 0);
  11846. cb(k_nope, "k_nope", il);
  11847. // and {n_head * n_embd_head_v, n_tokens}
  11848. struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
  11849. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
  11850. ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
  11851. ggml_row_size(kv->type, (n_embd_head_qk_nope)));
  11852. cb(v_states, "v_states", il);
  11853. v_states = ggml_cont(ctx0, v_states);
  11854. cb(v_states, "v_states", il);
  11855. v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
  11856. ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
  11857. 0);
  11858. cb(v_states, "v_states", il);
  11859. q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
  11860. q_pe = ggml_rope_ext(
  11861. ctx0, q_pe, inp_pos, nullptr,
  11862. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11863. ext_factor, attn_factor_scaled, beta_fast, beta_slow
  11864. );
  11865. cb(q_pe, "q_pe", il);
  11866. // shared RoPE key
  11867. k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
  11868. k_pe = ggml_rope_ext(
  11869. ctx0, k_pe, inp_pos, nullptr,
  11870. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11871. ext_factor, attn_factor_scaled, beta_fast, beta_slow
  11872. );
  11873. cb(k_pe, "k_pe", il);
  11874. struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
  11875. cb(q_states, "q_states", il);
  11876. struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
  11877. cb(k_states, "k_states", il);
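// per-head Q/K are rebuilt by concatenating the no-RoPE part and the RoPE part along dim 0;
// the single shared k_pe head is broadcast to all n_head heads via ggml_repeat before the concat.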
  11878. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  11879. model.layers[il].wo, NULL,
  11880. k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
  11881. }
  11882. if (il == n_layer - 1) {
  11883. // skip computing output for unused tokens
  11884. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  11885. n_tokens = n_outputs;
  11886. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  11887. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  11888. }
  11889. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  11890. cb(ffn_inp, "ffn_inp", il);
  11891. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  11892. model.layers[il].ffn_norm, NULL,
  11893. LLM_NORM_RMS, cb, il);
  11894. cb(cur, "ffn_norm", il);
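// the first hparams.n_layer_dense_lead layers use a plain dense FFN; the remaining layers use
// routed experts plus an always-on shared expert whose output is added to the MoE output.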
  11895. if ((uint32_t) il < hparams.n_layer_dense_lead) {
  11896. cur = llm_build_ffn(ctx0, lctx, cur,
  11897. model.layers[il].ffn_up, NULL, NULL,
  11898. model.layers[il].ffn_gate, NULL, NULL,
  11899. model.layers[il].ffn_down, NULL, NULL,
  11900. NULL,
  11901. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  11902. cb(cur, "ffn_out", il);
  11903. } else {
  11904. // MoE branch
  11905. ggml_tensor * moe_out =
  11906. llm_build_moe_ffn(ctx0, lctx, cur,
  11907. model.layers[il].ffn_gate_inp,
  11908. model.layers[il].ffn_up_exps,
  11909. model.layers[il].ffn_gate_exps,
  11910. model.layers[il].ffn_down_exps,
  11911. n_expert, n_expert_used,
  11912. LLM_FFN_SILU, false,
  11913. true, hparams.expert_weights_scale,
  11914. cb, il);
  11915. cb(moe_out, "ffn_moe_out", il);
  11916. // FFN shared expert
  11917. {
  11918. ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
  11919. model.layers[il].ffn_up_shexp, NULL, NULL,
  11920. model.layers[il].ffn_gate_shexp, NULL, NULL,
  11921. model.layers[il].ffn_down_shexp, NULL, NULL,
  11922. NULL,
  11923. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  11924. cb(ffn_shexp, "ffn_shexp", il);
  11925. cur = ggml_add(ctx0, moe_out, ffn_shexp);
  11926. cb(cur, "ffn_out", il);
  11927. }
  11928. }
  11929. cur = ggml_add(ctx0, cur, ffn_inp);
  11930. cur = lctx.cvec.apply_to(ctx0, cur, il);
  11931. cb(cur, "l_out", il);
  11932. // input for next layer
  11933. inpL = cur;
  11934. }
  11935. cur = inpL;
  11936. cur = llm_build_norm(ctx0, cur, hparams,
  11937. model.output_norm, NULL,
  11938. LLM_NORM_RMS, cb, -1);
  11939. cb(cur, "result_norm", -1);
  11940. // lm_head
  11941. cur = ggml_mul_mat(ctx0, model.output, cur);
  11942. cb(cur, "result_output", -1);
  11943. ggml_build_forward_expand(gf, cur);
  11944. return gf;
  11945. }
  11946. struct ggml_cgraph * build_bitnet() {
  11947. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  11948. const int64_t n_embd_head = hparams.n_embd_head_v;
  11949. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  11950. struct ggml_tensor * cur;
  11951. struct ggml_tensor * inpL;
  11952. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  11953. // inp_pos - contains the positions
  11954. struct ggml_tensor * inp_pos = build_inp_pos();
  11955. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  11956. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  11957. for (int il = 0; il < n_layer; ++il) {
  11958. struct ggml_tensor * inpSA = inpL;
  11959. cur = llm_build_norm(ctx0, inpL, hparams,
  11960. model.layers[il].attn_norm, NULL,
  11961. LLM_NORM_RMS, cb, il);
  11962. cb(cur, "attn_norm", il);
  11963. // self-attention
  11964. {
  11965. // compute Q and K and RoPE them
  11966. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  11967. Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_scale);
  11968. cb(Qcur, "Qcur", il);
  11969. if (model.layers[il].bq) {
  11970. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  11971. cb(Qcur, "Qcur", il);
  11972. }
  11973. // B1.K
  11974. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  11975. Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_scale);
  11976. cb(Kcur, "Kcur", il);
  11977. if (model.layers[il].bk) {
  11978. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  11979. cb(Kcur, "Kcur", il);
  11980. }
  11981. // B1.V
  11982. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  11983. Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_scale);
  11984. cb(Vcur, "Vcur", il);
  11985. if (model.layers[il].bv) {
  11986. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  11987. cb(Vcur, "Vcur", il);
  11988. }
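// hedged note: BitNet stores its low-bit projection weights together with per-tensor scale
// tensors, which is why every llm_build_lora_mm above is followed by a ggml_mul with the matching
// *_scale tensor; the quantization details themselves live in the conversion code, not here.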
  11989. Qcur = ggml_rope_ext(
  11990. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  11991. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11992. ext_factor, attn_factor, beta_fast, beta_slow
  11993. );
  11994. cb(Qcur, "Qcur", il);
  11995. Kcur = ggml_rope_ext(
  11996. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  11997. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  11998. ext_factor, attn_factor, beta_fast, beta_slow
  11999. );
  12000. cb(Kcur, "Kcur", il);
  12001. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  12002. NULL, NULL,
  12003. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  12004. cur = llm_build_norm(ctx0, cur, hparams,
  12005. model.layers[il].attn_sub_norm, NULL,
  12006. LLM_NORM_RMS, cb, il);
  12007. cb(cur, "attn_sub_norm", il);
  12008. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
  12009. cur = ggml_mul(ctx0, cur, model.layers[il].wo_scale);
  12010. if (model.layers[il].bo) {
  12011. cur = ggml_add(ctx0, cur, model.layers[il].bo);
  12012. }
  12013. cb(cur, "attn_o_out", il);
  12014. }
  12015. if (il == n_layer - 1) {
  12016. // skip computing output for unused tokens
  12017. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  12018. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12019. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12020. }
  12021. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12022. cb(ffn_inp, "ffn_inp", il);
12023. // feed-forward network
  12024. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  12025. model.layers[il].ffn_norm, NULL,
  12026. LLM_NORM_RMS, cb, il);
  12027. cb(cur, "ffn_norm", il);
  12028. cur = llm_build_ffn(ctx0, lctx, cur,
  12029. model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
  12030. model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
  12031. NULL, NULL, NULL,
  12032. NULL,
  12033. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  12034. cb(cur, "ffn_sub_out", il);
  12035. cur = llm_build_norm(ctx0, cur, hparams,
  12036. model.layers[il].ffn_sub_norm, NULL,
  12037. LLM_NORM_RMS, cb, il);
  12038. cb(cur, "ffn_sub_norm", il);
  12039. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].ffn_down, cur);
  12040. cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
  12041. cb(cur, "ffn_down", il);
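// note: the FFN is deliberately split here: llm_build_ffn above only applies up/gate (the down
// slot is NULL), then ffn_sub_norm is applied before the separate down projection and its scale,
// mirroring the attention path where wo is applied after attn_sub_norm.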
  12042. cur = ggml_add(ctx0, cur, ffn_inp);
  12043. cb(cur, "l_out", il);
  12044. // input for next layer
  12045. inpL = cur;
  12046. }
  12047. cur = inpL;
  12048. cur = llm_build_norm(ctx0, cur, hparams,
  12049. model.output_norm, NULL,
  12050. LLM_NORM_RMS, cb, -1);
  12051. cb(cur, "result_norm", -1);
  12052. // lm_head
  12053. cur = llm_build_lora_mm(lctx, ctx0, model.tok_embd, cur);
  12054. cb(cur, "result_output", -1);
  12055. ggml_build_forward_expand(gf, cur);
  12056. return gf;
  12057. }
  12058. struct ggml_cgraph * build_t5_encoder() {
  12059. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  12060. // mutable variable, needed during the last layer of the computation to skip unused tokens
  12061. int32_t n_tokens = this->n_tokens;
  12062. const int64_t n_embd_head = hparams.n_embd_head_v;
  12063. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  12064. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12065. struct ggml_tensor * cur;
  12066. struct ggml_tensor * inpL;
  12067. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  12068. GGML_ASSERT(lctx.is_encoding);
  12069. struct ggml_tensor * pos_bucket_enc = llm_build_pos_bucket(false);
  12070. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  12071. struct ggml_tensor * KQ_mask_enc = build_inp_KQ_mask(false);
  12072. for (int il = 0; il < n_layer; ++il) {
  12073. struct ggml_tensor * inpSA = inpL;
  12074. // norm
  12075. cur = llm_build_norm(ctx0, inpL, hparams,
  12076. model.layers[il].attn_norm_enc, NULL,
  12077. LLM_NORM_RMS, cb, il);
  12078. cb(cur, "attn_norm", il);
  12079. // self-attention
  12080. {
  12081. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
  12082. cb(Qcur, "Qcur", il);
  12083. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
  12084. cb(Kcur, "Kcur", il);
  12085. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
  12086. cb(Vcur, "Vcur", il);
  12087. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  12088. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
  12089. struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  12090. struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  12091. struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  12092. cb(kq, "kq", il);
  12093. struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b_enc ? model.layers[il].attn_rel_b_enc : model.layers[0].attn_rel_b_enc;
  12094. struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_enc, attn_rel_b);
  12095. struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
  12096. cb(kq_b, "kq_b", il);
  12097. kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_enc, 1.0f, hparams.f_max_alibi_bias);
  12098. cb(kq, "kq_soft_max_ext", il);
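// T5-style relative attention: a bucketed position bias (pos_bias) is added to the raw scores
// before the masked softmax; layers without their own attn_rel_b_enc fall back to layer 0's tensor.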
  12099. struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
  12100. cb(v, "v", il);
  12101. struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_tokens, n_embd_head, n_head_kv), kq);
  12102. cb(kqv, "kqv", il);
  12103. struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  12104. cb(kqv_merged, "kqv_merged", il);
  12105. cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  12106. cb(cur, "kqv_merged_cont", il);
  12107. ggml_build_forward_expand(gf, cur);
  12108. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
  12109. cb(cur, "kqv_out", il);
  12110. }
  12111. if (il == n_layer - 1) {
  12112. // skip computing output for unused tokens
  12113. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  12114. n_tokens = n_outputs;
  12115. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12116. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12117. }
  12118. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12119. cb(ffn_inp, "ffn_inp", il);
  12120. // feed-forward network
  12121. {
  12122. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  12123. model.layers[il].ffn_norm_enc, NULL,
  12124. LLM_NORM_RMS, cb, il);
  12125. cb(cur, "ffn_norm", il);
  12126. // T5 uses relu, flan-T5 uses gelu-gated
  12127. cur = llm_build_ffn(ctx0, lctx, cur,
  12128. model.layers[il].ffn_up_enc, NULL, NULL,
  12129. model.layers[il].ffn_gate_enc, NULL, NULL,
  12130. model.layers[il].ffn_down_enc, NULL, NULL,
  12131. NULL,
  12132. model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
  12133. model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
  12134. cb, il);
  12135. cb(cur, "ffn_out", il);
  12136. }
  12137. cur = ggml_add(ctx0, cur, ffn_inp);
  12138. cb(cur, "ffn_out", il);
  12139. ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
  12140. if (layer_dir != nullptr) {
  12141. cur = ggml_add(ctx0, cur, layer_dir);
  12142. }
  12143. cb(cur, "l_out", il);
  12144. // input for next layer
  12145. inpL = cur;
  12146. }
  12147. cur = inpL;
  12148. cb(cur, "result_embd", -1);
  12149. cur = llm_build_norm(ctx0, cur, hparams,
  12150. model.output_norm_enc, NULL,
  12151. LLM_NORM_RMS, cb, -1);
  12152. cb(cur, "result_norm", -1);
  12153. ggml_build_forward_expand(gf, cur);
  12154. return gf;
  12155. }
  12156. struct ggml_cgraph * build_t5_decoder() {
  12157. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  12158. // mutable variable, needed during the last layer of the computation to skip unused tokens
  12159. int32_t n_tokens = this->n_tokens;
  12160. const int64_t n_embd_head = hparams.n_embd_head_v;
  12161. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  12162. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12163. struct ggml_tensor * cur;
  12164. struct ggml_tensor * inpL;
  12165. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  12166. GGML_ASSERT(!lctx.is_encoding);
  12167. GGML_ASSERT(n_outputs_enc > 0 && "call llama_encode() first");
  12168. struct ggml_tensor * embd_enc = llm_build_inp_embd_enc();
  12169. struct ggml_tensor * pos_bucket_dec = llm_build_pos_bucket(true);
  12170. struct ggml_tensor * KQ_mask_dec = build_inp_KQ_mask();
  12171. struct ggml_tensor * KQ_mask_cross = llm_build_inp_KQ_mask_cross();
  12172. for (int il = 0; il < n_layer; ++il) {
  12173. struct ggml_tensor * inpSA = inpL;
  12174. // norm
  12175. cur = llm_build_norm(ctx0, inpL, hparams,
  12176. model.layers[il].attn_norm, NULL,
  12177. LLM_NORM_RMS, cb, il);
  12178. cb(cur, "attn_norm", il);
  12179. // self-attention
  12180. {
  12181. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  12182. cb(Qcur, "Qcur", il);
  12183. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  12184. cb(Kcur, "Kcur", il);
  12185. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  12186. cb(Vcur, "Vcur", il);
  12187. llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
  12188. struct ggml_tensor * k =
  12189. ggml_view_3d(ctx0, kv_self.k_l[il],
  12190. n_embd_head_k, n_kv, n_head_kv,
  12191. ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
  12192. ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
  12193. 0);
  12194. cb(k, "k", il);
  12195. struct ggml_tensor * v =
  12196. ggml_view_3d(ctx0, kv_self.v_l[il],
  12197. n_kv, n_embd_head_v, n_head_kv,
  12198. ggml_element_size(kv_self.v_l[il])*n_ctx,
  12199. ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head_v,
  12200. 0);
  12201. cb(v, "v", il);
  12202. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  12203. struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  12204. struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  12205. cb(kq, "kq", il);
  12206. struct ggml_tensor * attn_rel_b = model.layers[il].attn_rel_b ? model.layers[il].attn_rel_b : model.layers[0].attn_rel_b;
  12207. struct ggml_tensor * pos_bias = llm_build_pos_bias(pos_bucket_dec, attn_rel_b);
  12208. struct ggml_tensor * kq_b = ggml_add(ctx0, kq, pos_bias);
  12209. cb(kq_b, "kq_b", il);
  12210. kq = ggml_soft_max_ext(ctx0, kq_b, KQ_mask_dec, 1.0f, hparams.f_max_alibi_bias);
  12211. cb(kq, "kq_soft_max_ext", il);
  12212. struct ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
  12213. cb(kqv, "kqv", il);
  12214. struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  12215. cb(kqv_merged, "kqv_merged", il);
  12216. cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  12217. cb(cur, "kqv_merged_cont", il);
  12218. ggml_build_forward_expand(gf, cur);
  12219. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
  12220. cb(cur, "kqv_out", il);
  12221. }
  12222. cur = ggml_add(ctx0, cur, inpSA);
  12223. cb(cur, "cross_inp", il);
  12224. struct ggml_tensor * inpCA = cur;
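// the decoder layer carries two attention residuals: self-attention was just added onto inpSA to
// form inpCA, and the cross-attention below (over the encoder output embd_enc, n_outputs_enc
// columns) is added onto inpCA before the feed-forward network.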
  12225. // norm
  12226. cur = llm_build_norm(ctx0, cur, hparams,
  12227. model.layers[il].attn_norm_cross, NULL,
  12228. LLM_NORM_RMS, cb, il);
  12229. cb(cur, "attn_norm_cross", il);
  12230. // cross-attention
  12231. {
  12232. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
  12233. cb(Qcur, "Qcur", il);
  12234. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
  12235. cb(Kcur, "Kcur", il);
  12236. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
  12237. cb(Vcur, "Vcur", il);
  12238. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  12239. Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_outputs_enc);
  12240. struct ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
  12241. struct ggml_tensor * k = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
  12242. struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
  12243. cb(kq, "kq", il);
  12244. kq = ggml_soft_max_ext(ctx0, kq, KQ_mask_cross, 1.0f, hparams.f_max_alibi_bias);
  12245. cb(kq, "kq_soft_max_ext", il);
  12246. struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_outputs_enc)));
  12247. cb(v, "v", il);
  12248. struct ggml_tensor * kqv = ggml_mul_mat(ctx0, ggml_reshape_3d(ctx0, v, n_outputs_enc, n_embd_head, n_head_kv), kq);
  12249. cb(kqv, "kqv", il);
  12250. struct ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
  12251. cb(kqv_merged, "kqv_merged", il);
  12252. cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_gqa, n_tokens);
  12253. cb(cur, "kqv_merged_cont", il);
  12254. ggml_build_forward_expand(gf, cur);
  12255. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
  12256. cb(cur, "kqv_out", il);
  12257. }
  12258. if (il == n_layer - 1) {
  12259. // skip computing output for unused tokens
  12260. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  12261. n_tokens = n_outputs;
  12262. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12263. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12264. inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
  12265. }
  12266. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpCA);
  12267. cb(ffn_inp, "ffn_inp", il);
  12268. // feed-forward network
  12269. {
  12270. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  12271. model.layers[il].ffn_norm, NULL,
  12272. LLM_NORM_RMS, cb, il);
  12273. cb(cur, "ffn_norm", il);
  12274. // T5 uses relu, flan-T5 uses gelu-gated
  12275. cur = llm_build_ffn(ctx0, lctx, cur,
  12276. model.layers[il].ffn_up, NULL, NULL,
  12277. model.layers[il].ffn_gate, NULL, NULL,
  12278. model.layers[il].ffn_down, NULL, NULL,
  12279. NULL,
12280. model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
12281. model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
  12282. cb, il);
  12283. cb(cur, "ffn_out", il);
  12284. }
  12285. cur = ggml_add(ctx0, cur, ffn_inp);
  12286. cb(cur, "ffn_out", il);
  12287. ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
  12288. if (layer_dir != nullptr) {
  12289. cur = ggml_add(ctx0, cur, layer_dir);
  12290. }
  12291. cb(cur, "l_out", il);
  12292. // input for next layer
  12293. inpL = cur;
  12294. }
  12295. cur = inpL;
  12296. cb(cur, "result_embd", -1);
  12297. cur = llm_build_norm(ctx0, cur, hparams,
  12298. model.output_norm, NULL,
  12299. LLM_NORM_RMS, cb, -1);
  12300. cb(cur, "result_norm", -1);
  12301. // lm_head
  12302. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  12303. cb(cur, "result_output", -1);
  12304. ggml_build_forward_expand(gf, cur);
  12305. return gf;
  12306. }
  12307. struct ggml_cgraph * build_jais() {
  12308. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  12309. const int64_t n_embd_head = hparams.n_embd_head_v;
  12310. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  12311. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12312. struct ggml_tensor * cur;
  12313. struct ggml_tensor * inpL;
  12314. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  12315. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  12316. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  12317. for (int il = 0; il < n_layer; ++il) {
  12318. cur = llm_build_norm(ctx0, inpL, hparams,
  12319. model.layers[il].attn_norm,
  12320. model.layers[il].attn_norm_b,
  12321. LLM_NORM, cb, il);
  12322. cb(cur, "attn_norm", il);
  12323. // self-attention
  12324. {
  12325. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  12326. cb(cur, "wqkv", il);
  12327. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  12328. cb(cur, "bqkv", il);
  12329. struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd)));
  12330. struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd)));
  12331. struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa)));
  12332. cb(Qcur, "Qcur", il);
  12333. cb(Kcur, "Kcur", il);
  12334. cb(Vcur, "Vcur", il);
  12335. Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
  12336. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  12337. model.layers[il].wo, model.layers[il].bo,
  12338. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il);
  12339. }
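// note: unlike the RoPE-based graphs above, this graph builds no inp_pos and applies no rotary
// embedding; the attention scale is also 1.0f/n_embd_head rather than the usual 1/sqrt(n_embd_head).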
  12340. if (il == n_layer - 1) {
  12341. // skip computing output for unused tokens
  12342. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  12343. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12344. inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
  12345. }
  12346. // add the input
  12347. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
  12348. cb(ffn_inp, "ffn_inp", il);
  12349. // FF
  12350. {
  12351. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  12352. model.layers[il].ffn_norm,
  12353. model.layers[il].ffn_norm_b,
  12354. LLM_NORM, cb, il);
  12355. cb(cur, "ffn_norm", il);
  12356. cur = llm_build_ffn(ctx0, lctx, cur,
  12357. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  12358. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  12359. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  12360. NULL,
  12361. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  12362. cb(cur, "ffn_out", il);
  12363. }
  12364. inpL = ggml_add(ctx0, cur, ffn_inp);
  12365. cb(inpL, "l_out", il);
  12366. }
  12367. cur = llm_build_norm(ctx0, inpL, hparams,
  12368. model.output_norm,
  12369. model.output_norm_b,
  12370. LLM_NORM, cb, -1);
  12371. cb(cur, "result_norm", -1);
  12372. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  12373. cb(cur, "result_output", -1);
  12374. ggml_build_forward_expand(gf, cur);
  12375. return gf;
  12376. }
  12377. struct ggml_cgraph * build_chatglm() {
  12378. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  12379. const int64_t n_embd_head = hparams.n_embd_head_v;
  12380. const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
  12381. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12382. struct ggml_tensor * cur;
  12383. struct ggml_tensor * inpL;
  12384. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  12385. // inp_pos - contains the positions
  12386. struct ggml_tensor * inp_pos = build_inp_pos();
  12387. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  12388. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  12389. for (int il = 0; il < n_layer; ++il) {
  12390. struct ggml_tensor * inpSA = inpL;
  12391. cur = llm_build_norm(ctx0, inpL, hparams,
  12392. model.layers[il].attn_norm,
  12393. NULL,
  12394. LLM_NORM_RMS, cb, il);
  12395. cb(cur, "attn_norm", il);
  12396. // self-attention
  12397. {
  12398. struct ggml_tensor * Qcur = nullptr;
  12399. struct ggml_tensor * Kcur = nullptr;
  12400. struct ggml_tensor * Vcur = nullptr;
  12401. cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur);
  12402. cb(cur, "wqkv", il);
  12403. cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
  12404. cb(cur, "bqkv", il);
  12405. Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
  12406. Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
  12407. Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
  12408. cb(Qcur, "Qcur", il);
  12409. cb(Kcur, "Kcur", il);
  12410. cb(Vcur, "Vcur", il);
  12411. //printf("freq_base: %f freq_scale: %f ext_factor: %f attn_factor: %f\n", freq_base, freq_scale, ext_factor, attn_factor);
  12412. Qcur = ggml_rope_ext(
  12413. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  12414. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12415. ext_factor, attn_factor, beta_fast, beta_slow
  12416. );
  12417. cb(Qcur, "Qcur_rope", il);
  12418. Kcur = ggml_rope_ext(
  12419. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  12420. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12421. ext_factor, attn_factor, beta_fast, beta_slow
  12422. );
  12423. cb(Kcur, "Kcur_rope", il);
  12424. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  12425. model.layers[il].wo, NULL,
  12426. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  12427. }
  12428. if (il == n_layer - 1) {
  12429. // skip computing output for unused tokens
  12430. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  12431. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12432. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12433. }
  12434. // Add the input
  12435. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12436. cb(ffn_inp, "ffn_inp", il);
  12437. // FF
  12438. {
  12439. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  12440. model.layers[il].ffn_norm,
  12441. NULL,
  12442. LLM_NORM_RMS, cb, il);
  12443. cb(cur, "ffn_norm", il);
  12444. cur = llm_build_ffn(ctx0, lctx, cur,
  12445. model.layers[il].ffn_up, NULL, NULL,
  12446. NULL, NULL, NULL,
  12447. model.layers[il].ffn_down, NULL, NULL,
  12448. NULL,
  12449. LLM_FFN_SWIGLU, LLM_FFN_SEQ, cb, il);
  12450. cb(cur, "ffn_out", il);
  12451. }
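// here ffn_up holds the fused gate+up projection and the gate slot is passed as NULL;
// LLM_FFN_SWIGLU is expected to split that single projection in half and apply SiLU gating
// internally (hedged: see llm_build_ffn for the exact split).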
  12452. inpL = ggml_add(ctx0, cur, ffn_inp);
  12453. cb(inpL, "l_out", il);
  12454. }
  12455. cur = llm_build_norm(ctx0, inpL, hparams,
  12456. model.output_norm,
  12457. NULL,
  12458. LLM_NORM_RMS, cb, -1);
  12459. cb(cur, "result_norm", -1);
  12460. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  12461. cb(cur, "result_output", -1);
  12462. ggml_build_forward_expand(gf, cur);
  12463. return gf;
  12464. }
  12465. struct ggml_cgraph * build_nemotron() {
  12466. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  12467. const int64_t n_embd_head = hparams.n_embd_head_v;
  12468. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12469. //GGML_ASSERT(n_embd_head == hparams.n_rot);
  12470. struct ggml_tensor * cur;
  12471. struct ggml_tensor * inpL;
  12472. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  12473. // inp_pos - contains the positions
  12474. struct ggml_tensor * inp_pos = build_inp_pos();
  12475. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  12476. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  12477. for (int il = 0; il < n_layer; ++il) {
  12478. struct ggml_tensor * inpSA = inpL;
  12479. // norm
  12480. cur = llm_build_norm(ctx0, inpL, hparams,
  12481. model.layers[il].attn_norm,
  12482. model.layers[il].attn_norm_b,
  12483. LLM_NORM, cb, il);
  12484. cb(cur, "attn_norm", il);
  12485. // self-attention
  12486. {
  12487. // compute Q and K and RoPE them
  12488. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  12489. cb(Qcur, "Qcur", il);
  12490. if (model.layers[il].bq) {
  12491. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  12492. cb(Qcur, "Qcur", il);
  12493. }
  12494. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  12495. cb(Kcur, "Kcur", il);
  12496. if (model.layers[il].bk) {
  12497. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  12498. cb(Kcur, "Kcur", il);
  12499. }
  12500. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  12501. cb(Vcur, "Vcur", il);
  12502. if (model.layers[il].bv) {
  12503. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  12504. cb(Vcur, "Vcur", il);
  12505. }
  12506. Qcur = ggml_rope_ext(
  12507. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
  12508. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12509. ext_factor, attn_factor, beta_fast, beta_slow
  12510. );
  12511. cb(Qcur, "Qcur", il);
  12512. Kcur = ggml_rope_ext(
  12513. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
  12514. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12515. ext_factor, attn_factor, beta_fast, beta_slow
  12516. );
  12517. cb(Kcur, "Kcur", il);
  12518. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  12519. model.layers[il].wo, model.layers[il].bo,
  12520. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  12521. }
  12522. if (il == n_layer - 1) {
  12523. // skip computing output for unused tokens
  12524. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  12525. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12526. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12527. }
  12528. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12529. cb(ffn_inp, "ffn_inp", il);
  12530. // feed-forward network
  12531. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  12532. model.layers[il].ffn_norm,
  12533. model.layers[il].ffn_norm_b,
  12534. LLM_NORM, cb, il);
  12535. cb(cur, "ffn_norm", il);
  12536. cur = llm_build_ffn(ctx0, lctx, cur,
  12537. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  12538. NULL, NULL, NULL,
  12539. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  12540. NULL,
  12541. LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
  12542. cur = ggml_add(ctx0, cur, ffn_inp);
  12543. cb(cur, "ffn_out", il);
  12544. cur = lctx.cvec.apply_to(ctx0, cur, il);
  12545. cb(cur, "l_out", il);
  12546. // input for next layer
  12547. inpL = cur;
  12548. }
  12549. cur = inpL;
  12550. cur = llm_build_norm(ctx0, cur, hparams,
  12551. model.output_norm, model.output_norm_b,
  12552. LLM_NORM, cb, -1);
  12553. cb(cur, "result_norm", -1);
  12554. // lm_head
  12555. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  12556. cb(cur, "result_output", -1);
  12557. ggml_build_forward_expand(gf, cur);
  12558. return gf;
  12559. }
  12560. struct ggml_cgraph * build_exaone() {
  12561. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  12562. // mutable variable, needed during the last layer of the computation to skip unused tokens
  12563. int32_t n_tokens = this->n_tokens;
  12564. const int64_t n_embd_head = hparams.n_embd_head_v;
  12565. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12566. GGML_ASSERT(n_embd_head == hparams.n_rot);
  12567. struct ggml_tensor * cur;
  12568. struct ggml_tensor * inpL;
  12569. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  12570. // inp_pos - contains the positions
  12571. struct ggml_tensor * inp_pos = build_inp_pos();
  12572. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  12573. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  12574. for (int il = 0; il < n_layer; ++il) {
  12575. struct ggml_tensor * inpSA = inpL;
  12576. // norm
  12577. cur = llm_build_norm(ctx0, inpL, hparams,
  12578. model.layers[il].attn_norm, NULL,
  12579. LLM_NORM_RMS, cb, il);
  12580. cb(cur, "attn_norm", il);
  12581. // self-attention
  12582. {
  12583. // rope freq factors for llama3; may return nullptr for llama2 and other models
  12584. struct ggml_tensor * rope_factors = build_rope_factors(il);
  12585. // compute Q and K and RoPE them
  12586. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  12587. cb(Qcur, "Qcur", il);
  12588. if (model.layers[il].bq) {
  12589. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  12590. cb(Qcur, "Qcur", il);
  12591. }
  12592. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  12593. cb(Kcur, "Kcur", il);
  12594. if (model.layers[il].bk) {
  12595. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  12596. cb(Kcur, "Kcur", il);
  12597. }
  12598. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  12599. cb(Vcur, "Vcur", il);
  12600. if (model.layers[il].bv) {
  12601. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  12602. cb(Vcur, "Vcur", il);
  12603. }
  12604. Qcur = ggml_rope_ext(
  12605. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
  12606. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12607. ext_factor, attn_factor, beta_fast, beta_slow
  12608. );
  12609. cb(Qcur, "Qcur", il);
  12610. Kcur = ggml_rope_ext(
  12611. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
  12612. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12613. ext_factor, attn_factor, beta_fast, beta_slow
  12614. );
  12615. cb(Kcur, "Kcur", il);
  12616. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  12617. model.layers[il].wo, model.layers[il].bo,
  12618. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  12619. }
  12620. if (il == n_layer - 1) {
  12621. // skip computing output for unused tokens
  12622. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  12623. n_tokens = n_outputs;
  12624. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12625. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12626. }
  12627. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12628. cb(ffn_inp, "ffn_inp", il);
  12629. // feed-forward network
  12630. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  12631. model.layers[il].ffn_norm, NULL,
  12632. LLM_NORM_RMS, cb, il);
  12633. cb(cur, "ffn_norm", il);
  12634. cur = llm_build_ffn(ctx0, lctx, cur,
  12635. model.layers[il].ffn_up, NULL, NULL,
  12636. model.layers[il].ffn_gate, NULL, NULL,
  12637. model.layers[il].ffn_down, NULL, NULL,
  12638. NULL,
  12639. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  12640. cb(cur, "ffn_out", il);
  12641. cur = ggml_add(ctx0, cur, ffn_inp);
  12642. cb(cur, "ffn_out", il);
  12643. cur = lctx.cvec.apply_to(ctx0, cur, il);
  12644. cb(cur, "l_out", il);
  12645. // input for next layer
  12646. inpL = cur;
  12647. }
  12648. cur = inpL;
  12649. cur = llm_build_norm(ctx0, cur, hparams,
  12650. model.output_norm, NULL,
  12651. LLM_NORM_RMS, cb, -1);
  12652. cb(cur, "result_norm", -1);
  12653. // lm_head
  12654. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  12655. cb(cur, "result_output", -1);
  12656. ggml_build_forward_expand(gf, cur);
  12657. return gf;
  12658. }
  12659. ggml_cgraph * build_rwkv6() {
  12660. ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
12661. // Token shift state dimensions should be 2 * n_embd
  12662. GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2);
  12663. const int64_t n_seqs = batch.n_seqs;
  12664. const int64_t n_seq_tokens = batch.n_seq_tokens;
  12665. const int64_t n_tokens = batch.n_tokens;
  12666. GGML_ASSERT(n_seqs != 0);
  12667. GGML_ASSERT(batch.equal_seqs);
  12668. GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
  12669. struct ggml_tensor * cur;
  12670. struct ggml_tensor * inpL;
  12671. struct ggml_tensor * state_copy = build_inp_s_copy();
  12672. struct ggml_tensor * state_mask = build_inp_s_mask();
  12673. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  12674. inpL = llm_build_norm(ctx0, inpL, hparams, model.tok_norm, model.tok_norm_b, LLM_NORM, cb, -1);
  12675. for (int il = 0; il < n_layer; ++il) {
  12676. const llama_layer * layer = &model.layers[il];
  12677. // (ab)using the KV cache to store the states
  12678. struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
  12679. gf, kv_self.k_l[il], state_copy, state_mask,
  12680. hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
  12681. struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
  12682. gf, kv_self.v_l[il], state_copy, state_mask,
  12683. hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
  12684. cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
  12685. token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 2, n_seqs);
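// token_shift now has shape [n_embd, 2, n_seqs]: the first n_embd slice is the attention
// shift and the second the FFN shift, hence the two views below at offsets 0 and
// n_embd * ggml_element_size(token_shift).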
  12686. struct ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
  12687. struct ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
  12688. struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM, cb, il);
  12689. struct ggml_tensor * x_prev = ggml_concat(
  12690. ctx0,
  12691. att_shift,
  12692. ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
  12693. 1
  12694. );
  12695. cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
  12696. ggml_build_forward_expand(gf, cur);
  12697. ggml_build_forward_expand(
  12698. gf,
  12699. ggml_cpy(
  12700. ctx0,
  12701. wkv_states,
  12702. ggml_view_1d(
  12703. ctx0,
  12704. kv_self.v_l[il],
  12705. hparams.n_embd_v_s() * n_seqs,
  12706. hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
  12707. )
  12708. )
  12709. );
  12710. struct ggml_tensor * x_norm_ffn = llm_build_norm(ctx0, cur, hparams, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, cb, il);
  12711. x_prev = ggml_concat(
  12712. ctx0,
  12713. ffn_shift,
  12714. ggml_view_3d(ctx0, x_norm_ffn, n_embd, n_seq_tokens - 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], 0),
  12715. 1
  12716. );
  12717. cur = ggml_add(ctx0, cur, llm_build_rwkv6_channel_mix(lctx, ctx0, layer, x_norm_ffn, x_prev));
  12718. ggml_build_forward_expand(gf, cur);
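// Store the next token-shift state: the last normalized token of each sequence from the
// attention and FFN branches is concatenated below and copied back into kv_self.k_l[il]
// at the kv_head offset, to be reloaded for the next ubatch of the same sequences.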
  12719. struct ggml_tensor * last_norm_att = ggml_view_3d(ctx0, x_norm_att, n_embd, 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_att));
  12720. struct ggml_tensor * last_norm_ffn = ggml_view_3d(ctx0, x_norm_ffn, n_embd, 1, n_seqs, x_norm_ffn->nb[1], x_norm_ffn->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(x_norm_ffn));
  12721. token_shift = ggml_concat(ctx0, last_norm_att, last_norm_ffn, 1);
  12722. ggml_build_forward_expand(
  12723. gf,
  12724. ggml_cpy(
  12725. ctx0,
  12726. ggml_view_1d(ctx0, token_shift, n_embd * n_seqs * 2, 0),
  12727. ggml_view_1d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s() * n_seqs, hparams.n_embd_k_s() * kv_head * ggml_element_size(kv_self.k_l[il]))
  12728. )
  12729. );
  12730. if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) {
12731. cur = ggml_scale(ctx0, cur, 0.5f);
  12732. }
  12733. cur = lctx.cvec.apply_to(ctx0, cur, il);
  12734. cb(cur, "l_out", il);
  12735. // input for next layer
  12736. inpL = cur;
  12737. }
  12738. cur = inpL;
  12739. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  12740. cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
  12741. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12742. cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM, cb, -1);
  12743. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  12744. cb(cur, "result_output", -1);
  12745. ggml_build_forward_expand(gf, cur);
  12746. return gf;
  12747. }
  12748. ggml_cgraph * build_solar() {
  12749. struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
  12750. // mutable variable, needed during the last layer of the computation to skip unused tokens
  12751. int32_t n_tokens = this->n_tokens;
  12752. const int64_t n_embd_head = hparams.n_embd_head_v;
  12753. GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
  12754. GGML_ASSERT(n_embd_head == hparams.n_rot);
  12755. struct ggml_tensor * cur;
  12756. struct ggml_tensor * inpL;
  12757. inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  12758. // inp_pos - contains the positions
  12759. struct ggml_tensor * inp_pos = build_inp_pos();
  12760. // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
  12761. struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
  12762. struct ggml_tensor * bskcn_1;
  12763. struct ggml_tensor * bskcn_2;
  12764. for (int il = 0; il < n_layer; ++il) {
  12765. struct ggml_tensor * inpSA = inpL;
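// Solar-style block skip connections: layers flagged by n_bskcn(0/1, il) stash the current
// residual stream in bskcn_1/bskcn_2, and layers flagged by n_bskcn(2/3, il) blend the
// stashed tensor with the current inpSA using the two scalars stored in bskcn_tv.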
  12766. if (hparams.n_bskcn(0, il)) {
  12767. bskcn_1 = inpSA;
  12768. }
  12769. if (hparams.n_bskcn(1, il)) {
  12770. bskcn_2 = inpSA;
  12771. }
  12772. if (hparams.n_bskcn(2, il)) {
  12773. inpSA = ggml_add(
  12774. ctx0,
  12775. ggml_mul(ctx0, bskcn_1, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
  12776. ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
  12777. }
  12778. if (hparams.n_bskcn(3, il)) {
  12779. inpSA = ggml_add(
  12780. ctx0,
  12781. ggml_mul(ctx0, bskcn_2, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, 0)),
  12782. ggml_mul(ctx0, inpSA, ggml_view_1d(ctx0, model.layers[il].bskcn_tv, 1, ggml_element_size(model.layers[il].bskcn_tv))));
  12783. }
  12784. // norm
  12785. cur = llm_build_norm(ctx0, inpL, hparams,
  12786. model.layers[il].attn_norm, NULL,
  12787. LLM_NORM_RMS, cb, il);
  12788. cb(cur, "attn_norm", il);
  12789. // self-attention
  12790. {
  12791. // rope freq factors for llama3; may return nullptr for llama2 and other models
  12792. struct ggml_tensor * rope_factors = build_rope_factors(il);
  12793. // compute Q and K and RoPE them
  12794. struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
  12795. cb(Qcur, "Qcur", il);
  12796. if (model.layers[il].bq) {
  12797. Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
  12798. cb(Qcur, "Qcur", il);
  12799. }
  12800. struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
  12801. cb(Kcur, "Kcur", il);
  12802. if (model.layers[il].bk) {
  12803. Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
  12804. cb(Kcur, "Kcur", il);
  12805. }
  12806. struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
  12807. cb(Vcur, "Vcur", il);
  12808. if (model.layers[il].bv) {
  12809. Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
  12810. cb(Vcur, "Vcur", il);
  12811. }
  12812. Qcur = ggml_rope_ext(
  12813. ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
  12814. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12815. ext_factor, attn_factor, beta_fast, beta_slow
  12816. );
  12817. cb(Qcur, "Qcur", il);
  12818. Kcur = ggml_rope_ext(
  12819. ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
  12820. n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
  12821. ext_factor, attn_factor, beta_fast, beta_slow
  12822. );
  12823. cb(Kcur, "Kcur", il);
  12824. cur = llm_build_kv(ctx0, lctx, kv_self, gf,
  12825. model.layers[il].wo, model.layers[il].bo,
  12826. Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
  12827. }
  12828. if (il == n_layer - 1) {
  12829. // skip computing output for unused tokens
  12830. struct ggml_tensor * inp_out_ids = build_inp_out_ids();
  12831. n_tokens = n_outputs;
  12832. cur = ggml_get_rows(ctx0, cur, inp_out_ids);
  12833. inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
  12834. }
  12835. struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
  12836. cb(ffn_inp, "ffn_inp", il);
  12837. // feed-forward network
  12838. cur = llm_build_norm(ctx0, ffn_inp, hparams,
  12839. model.layers[il].ffn_norm, NULL,
  12840. LLM_NORM_RMS, cb, il);
  12841. cb(cur, "ffn_norm", il);
  12842. cur = llm_build_ffn(ctx0, lctx, cur,
  12843. model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
  12844. model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
  12845. model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
  12846. NULL,
  12847. LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
  12848. cb(cur, "ffn_out", il);
  12849. cur = ggml_add(ctx0, cur, ffn_inp);
  12850. cb(cur, "ffn_out", il);
  12851. cur = lctx.cvec.apply_to(ctx0, cur, il);
  12852. cb(cur, "l_out", il);
  12853. // input for next layer
  12854. inpL = cur;
  12855. }
  12856. cur = inpL;
  12857. cur = llm_build_norm(ctx0, cur, hparams,
  12858. model.output_norm, NULL,
  12859. LLM_NORM_RMS, cb, -1);
  12860. cb(cur, "result_norm", -1);
  12861. // lm_head
  12862. cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
  12863. cb(cur, "result_output", -1);
  12864. ggml_build_forward_expand(gf, cur);
  12865. return gf;
  12866. }
  12867. };
  12868. static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
  12869. llama_ubatch dummy = {};
  12870. dummy.equal_seqs = true;
  12871. llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
  12872. struct llm_build_context llm(lctx, dummy, cb, false);
  12873. llm.init();
  12874. struct ggml_cgraph * result = llm.build_defrag(ids);
  12875. llm.free();
  12876. return result;
  12877. }
  12878. static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
  12879. llama_ubatch dummy = {};
  12880. dummy.equal_seqs = true;
  12881. llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
  12882. struct llm_build_context llm(lctx, dummy, cb, false);
  12883. llm.init();
  12884. struct ggml_cgraph * result = llm.build_k_shift();
  12885. llm.free();
  12886. return result;
  12887. }
  12888. static struct ggml_cgraph * llama_build_graph(
  12889. llama_context & lctx,
  12890. const llama_ubatch & batch,
  12891. bool worst_case) {
  12892. const auto & model = lctx.model;
  12893. // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
  12894. llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
  12895. if (il >= 0) {
  12896. ggml_format_name(cur, "%s-%d", name, il);
  12897. } else {
  12898. ggml_set_name(cur, name);
  12899. }
  12900. if (!lctx.cparams.offload_kqv) {
  12901. if (strcmp(name, "kqv_merged_cont") == 0) {
  12902. // all nodes between the KV store and the attention output are run on the CPU
  12903. ggml_backend_sched_set_tensor_backend(lctx.sched, cur, lctx.backend_cpu);
  12904. }
  12905. }
  12906. // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
  12907. // FIXME: fix in ggml_backend_sched
  12908. const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
  12909. if (batch.n_tokens < 32 || full_offload) {
  12910. if (il != -1 && strcmp(name, "norm") == 0) {
  12911. for (auto * backend : lctx.backends) {
  12912. if (ggml_backend_supports_buft(backend, lctx.model.buft_layer[il].buft) &&
  12913. (ggml_backend_supports_op(backend, cur) || ggml_backend_offload_op(backend, cur))) {
  12914. ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
  12915. break;
  12916. }
  12917. }
  12918. }
  12919. }
  12920. };
  12921. struct ggml_cgraph * result = NULL;
  12922. struct llm_build_context llm(lctx, batch, cb, worst_case);
  12923. llm.init();
  12924. switch (model.arch) {
  12925. case LLM_ARCH_LLAMA:
  12926. {
  12927. result = llm.build_llama();
  12928. } break;
  12929. case LLM_ARCH_BAICHUAN:
  12930. {
  12931. result = llm.build_baichuan();
  12932. } break;
  12933. case LLM_ARCH_FALCON:
  12934. {
  12935. result = llm.build_falcon();
  12936. } break;
  12937. case LLM_ARCH_GROK:
  12938. {
  12939. result = llm.build_grok();
  12940. } break;
  12941. case LLM_ARCH_STARCODER:
  12942. {
  12943. result = llm.build_starcoder();
  12944. } break;
  12945. case LLM_ARCH_REFACT:
  12946. {
  12947. result = llm.build_refact();
  12948. } break;
  12949. case LLM_ARCH_BERT:
  12950. case LLM_ARCH_JINA_BERT_V2:
  12951. case LLM_ARCH_NOMIC_BERT:
  12952. {
  12953. result = llm.build_bert();
  12954. } break;
  12955. case LLM_ARCH_BLOOM:
  12956. {
  12957. result = llm.build_bloom();
  12958. } break;
  12959. case LLM_ARCH_MPT:
  12960. {
  12961. result = llm.build_mpt();
  12962. } break;
  12963. case LLM_ARCH_STABLELM:
  12964. {
  12965. result = llm.build_stablelm();
  12966. } break;
  12967. case LLM_ARCH_QWEN:
  12968. {
  12969. result = llm.build_qwen();
  12970. } break;
  12971. case LLM_ARCH_QWEN2:
  12972. {
  12973. result = llm.build_qwen2();
  12974. } break;
  12975. case LLM_ARCH_QWEN2MOE:
  12976. {
  12977. result = llm.build_qwen2moe();
  12978. } break;
  12979. case LLM_ARCH_PHI2:
  12980. {
  12981. result = llm.build_phi2();
  12982. } break;
  12983. case LLM_ARCH_PHI3:
  12984. {
  12985. result = llm.build_phi3();
  12986. } break;
  12987. case LLM_ARCH_PLAMO:
  12988. {
  12989. result = llm.build_plamo();
  12990. } break;
  12991. case LLM_ARCH_GPT2:
  12992. {
  12993. result = llm.build_gpt2();
  12994. } break;
  12995. case LLM_ARCH_CODESHELL:
  12996. {
  12997. result = llm.build_codeshell();
  12998. } break;
  12999. case LLM_ARCH_ORION:
  13000. {
  13001. result = llm.build_orion();
  13002. } break;
  13003. case LLM_ARCH_INTERNLM2:
  13004. {
  13005. result = llm.build_internlm2();
  13006. } break;
  13007. case LLM_ARCH_MINICPM:
  13008. {
  13009. result = llm.build_minicpm();
  13010. } break;
  13011. case LLM_ARCH_GEMMA:
  13012. {
  13013. result = llm.build_gemma();
  13014. } break;
  13015. case LLM_ARCH_GEMMA2:
  13016. {
  13017. result = llm.build_gemma2();
  13018. } break;
  13019. case LLM_ARCH_STARCODER2:
  13020. {
  13021. result = llm.build_starcoder2();
  13022. } break;
  13023. case LLM_ARCH_MAMBA:
  13024. {
  13025. result = llm.build_mamba();
  13026. } break;
  13027. case LLM_ARCH_XVERSE:
  13028. {
  13029. result = llm.build_xverse();
  13030. } break;
  13031. case LLM_ARCH_COMMAND_R:
  13032. {
  13033. result = llm.build_command_r();
  13034. } break;
  13035. case LLM_ARCH_DBRX:
  13036. {
  13037. result = llm.build_dbrx();
  13038. } break;
  13039. case LLM_ARCH_OLMO:
  13040. {
  13041. result = llm.build_olmo();
  13042. } break;
  13043. case LLM_ARCH_OPENELM:
  13044. {
  13045. result = llm.build_openelm();
  13046. } break;
  13047. case LLM_ARCH_GPTNEOX:
  13048. {
  13049. result = llm.build_gptneox();
  13050. } break;
  13051. case LLM_ARCH_ARCTIC:
  13052. {
  13053. result = llm.build_arctic();
  13054. } break;
  13055. case LLM_ARCH_DEEPSEEK2:
  13056. {
  13057. result = llm.build_deepseek2();
  13058. } break;
  13059. case LLM_ARCH_CHATGLM:
  13060. {
  13061. result = llm.build_chatglm();
  13062. } break;
  13063. case LLM_ARCH_BITNET:
  13064. {
  13065. result = llm.build_bitnet();
  13066. } break;
  13067. case LLM_ARCH_T5:
  13068. {
  13069. if (lctx.is_encoding) {
  13070. result = llm.build_t5_encoder();
  13071. } else {
  13072. result = llm.build_t5_decoder();
  13073. }
  13074. } break;
  13075. case LLM_ARCH_T5ENCODER:
  13076. {
  13077. result = llm.build_t5_encoder();
  13078. } break;
  13079. case LLM_ARCH_JAIS:
  13080. {
  13081. result = llm.build_jais();
  13082. } break;
  13083. case LLM_ARCH_NEMOTRON:
  13084. {
  13085. result = llm.build_nemotron();
  13086. } break;
  13087. case LLM_ARCH_EXAONE:
  13088. {
  13089. result = llm.build_exaone();
  13090. } break;
  13091. case LLM_ARCH_RWKV6:
  13092. {
  13093. result = llm.build_rwkv6();
  13094. } break;
  13095. case LLM_ARCH_SOLAR:
  13096. {
  13097. result = llm.build_solar();
  13098. } break;
  13099. default:
  13100. GGML_ABORT("fatal error");
  13101. }
  13102. // add on pooling layer
  13103. if (lctx.cparams.embeddings) {
  13104. result = llm.append_pooling(result);
  13105. }
  13106. llm.free();
  13107. return result;
  13108. }
  13109. static void llama_set_k_shift(llama_context & lctx) {
  13110. const int64_t kv_size = lctx.kv_self.size;
  13111. assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
  13112. int32_t * data = (int32_t *) lctx.inp_K_shift->data;
  13113. for (int i = 0; i < kv_size; ++i) {
  13114. data[i] = lctx.kv_self.cells[i].delta;
  13115. }
  13116. }
  13117. static void llama_set_s_copy(llama_context & lctx) {
  13118. const int64_t kv_size = lctx.kv_self.size;
  13119. assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
  13120. int32_t * data = (int32_t *) lctx.inp_s_copy->data;
  13121. for (int i = 0; i < kv_size; ++i) {
  13122. data[i] = lctx.kv_self.cells[i].src;
  13123. }
  13124. }
  13125. static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
  13126. // TODO move to hparams if a T5 variant appears that uses a different value
  13127. const int64_t max_distance = 128;
  13128. if (bidirectional) {
  13129. n_buckets >>= 1;
  13130. }
  13131. const int64_t max_exact = n_buckets >> 1;
  13132. int32_t relative_position = x - y;
  13133. int32_t relative_bucket = 0;
  13134. if (bidirectional) {
  13135. relative_bucket += (relative_position > 0) * n_buckets;
  13136. relative_position = abs(relative_position);
  13137. } else {
  13138. relative_position = -std::min<int32_t>(relative_position, 0);
  13139. }
  13140. int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
  13141. relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
  13142. relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
  13143. return relative_bucket;
  13144. }
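// Worked example (assuming n_buckets = 32 and bidirectional = true, typical for T5):
// the bucket count is halved for the sign (32 -> 16) and max_exact = 8, so distances
// |x - y| < 8 get their own bucket, while larger distances up to max_distance = 128 are
// binned logarithmically into the remaining buckets and clamped to the last one.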
  13145. static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
  13146. //
  13147. // set input data
  13148. //
  13149. const auto & hparams = lctx.model.hparams;
  13150. const auto & cparams = lctx.cparams;
  13151. const auto & kv_self = lctx.kv_self;
  13152. if (batch.token) {
  13153. const int64_t n_tokens = batch.n_tokens;
  13154. ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
  13155. }
  13156. if (batch.embd) {
  13157. const int64_t n_embd = hparams.n_embd;
  13158. const int64_t n_tokens = batch.n_tokens;
  13159. ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
  13160. }
  13161. if (batch.pos && lctx.inp_pos) {
  13162. const int64_t n_tokens = batch.n_tokens;
  13163. ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
  13164. }
  13165. if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
  13166. GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
  13167. const int64_t n_tokens = batch.n_tokens;
  13168. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
  13169. int32_t * data = (int32_t *) lctx.inp_out_ids->data;
  13170. if (lctx.n_outputs == n_tokens) {
  13171. for (int i = 0; i < n_tokens; ++i) {
  13172. data[i] = i;
  13173. }
  13174. } else if (batch.output) {
  13175. int32_t n_outputs = 0;
  13176. for (int i = 0; i < n_tokens; ++i) {
  13177. if (batch.output[i]) {
  13178. data[n_outputs++] = i;
  13179. }
  13180. }
  13181. // the graph needs to have been passed the correct number of outputs
  13182. GGML_ASSERT(lctx.n_outputs == n_outputs);
  13183. } else if (lctx.n_outputs == 1) {
  13184. // only keep last output
  13185. data[0] = n_tokens - 1;
  13186. } else {
  13187. GGML_ASSERT(lctx.n_outputs == 0);
  13188. }
  13189. }
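// For example, with n_tokens = 4 and batch.output = {0, 1, 0, 1}, inp_out_ids receives
// {1, 3} and lctx.n_outputs must equal 2; with no per-token output flags and a single
// output, only the last token index (n_tokens - 1) is kept.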
  13190. GGML_ASSERT(
  13191. // (!a || b) is a logical implication (a -> b)
  13192. // !hparams.causal_attn -> !cparams.causal_attn
  13193. (hparams.causal_attn || !cparams.causal_attn) &&
  13194. "causal attention is not supported by this model"
  13195. );
  13196. if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
  13197. // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
  13198. if (cparams.causal_attn && !lctx.is_encoding) {
  13199. const int64_t n_kv = kv_self.n;
  13200. const int64_t n_tokens = batch.n_tokens;
  13201. const int64_t n_seq_tokens = batch.n_seq_tokens;
  13202. const int64_t n_seqs = batch.n_seqs;
  13203. float * data = nullptr;
  13204. float * data_swa = nullptr;
  13205. if (lctx.inp_KQ_mask) {
  13206. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
  13207. data = (float *) lctx.inp_KQ_mask->data;
  13208. }
  13209. if (lctx.inp_KQ_mask_swa) {
  13210. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
  13211. data_swa = (float *) lctx.inp_KQ_mask_swa->data;
  13212. }
  13213. // For causal attention, use only the previous KV cells
  13214. // of the correct sequence for each token of the batch.
  13215. // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
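// Mask value convention used below: 0.0f where a token may attend to a KV cell,
// -INFINITY where it may not, and -|pos_kv - pos_token| when ALiBi is enabled, so the
// mask can be added directly to the attention scores.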
  13216. for (int h = 0; h < 1; ++h) {
  13217. for (int s = 0; s < n_seqs; ++s) {
  13218. const llama_seq_id seq_id = batch.seq_id[s][0];
  13219. for (int j = 0; j < n_seq_tokens; ++j) {
  13220. const llama_pos pos = batch.pos[s*n_seq_tokens + j];
  13221. for (int i = 0; i < n_kv; ++i) {
  13222. float f;
  13223. if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
  13224. f = -INFINITY;
  13225. } else {
  13226. if (hparams.use_alibi) {
  13227. f = -std::abs(kv_self.cells[i].pos - pos);
  13228. } else {
  13229. f = 0.0f;
  13230. }
  13231. }
  13232. if (data) {
  13233. data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
  13234. }
  13235. // may need to cut off old tokens for sliding window
  13236. if (data_swa) {
  13237. if (pos - kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
  13238. f = -INFINITY;
  13239. }
  13240. data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
  13241. }
  13242. }
  13243. }
  13244. }
  13245. if (data) {
  13246. for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
  13247. for (int j = 0; j < n_kv; ++j) {
  13248. data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
  13249. }
  13250. }
  13251. }
  13252. if (data_swa) {
  13253. for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
  13254. for (int j = 0; j < n_kv; ++j) {
  13255. data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
  13256. }
  13257. }
  13258. }
  13259. }
  13260. } else {
  13261. const int64_t n_tokens = batch.n_tokens;
  13262. const int64_t n_seq_tokens = batch.n_seq_tokens;
  13263. const int64_t n_seqs = batch.n_seqs;
  13264. // when using kv cache, the mask needs to match the kv cache size
  13265. const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens;
  13266. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
  13267. float * data = (float *) lctx.inp_KQ_mask->data;
  13268. for (int h = 0; h < 1; ++h) {
  13269. for (int s1 = 0; s1 < n_seqs; ++s1) {
  13270. const llama_seq_id seq_id = batch.seq_id[s1][0];
  13271. for (int j = 0; j < n_seq_tokens; ++j) {
  13272. const int32_t tj = s1*n_seq_tokens + j;
  13273. for (int s0 = 0; s0 < n_seqs; ++s0) {
  13274. for (int i = 0; i < n_seq_tokens; ++i) {
  13275. const int32_t ti = s0*n_seq_tokens + i;
  13276. float f = -INFINITY;
  13277. for (int s = 0; s < batch.n_seq_id[s0]; ++s) {
  13278. if (batch.seq_id[s0][s] == seq_id) {
  13279. if (hparams.use_alibi) {
  13280. f = -std::abs(batch.pos[ti] - batch.pos[tj]);
  13281. } else {
  13282. f = 0.0f;
  13283. }
  13284. break;
  13285. }
  13286. }
  13287. data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f;
  13288. }
  13289. }
  13290. for (int i = n_tokens; i < n_stride; ++i) {
  13291. data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY;
  13292. }
  13293. }
  13294. }
  13295. }
  13296. }
  13297. }
  13298. if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
  13299. const int64_t n_tokens = batch.n_tokens;
  13300. const int64_t n_seq_tokens = batch.n_seq_tokens;
  13301. const int64_t n_seqs = batch.n_seqs;
  13302. GGML_ASSERT(lctx.inp_mean);
  13303. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
  13304. float * data = (float *) lctx.inp_mean->data;
  13305. memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
  13306. std::vector<uint64_t> sum(n_tokens, 0);
  13307. for (int s = 0; s < n_seqs; ++s) {
  13308. const llama_seq_id seq_id = batch.seq_id[s][0];
  13309. // TODO: adapt limits to n_seqs when batch.equal_seqs is true
  13310. GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");
  13311. sum[seq_id] += batch.n_seq_tokens;
  13312. }
  13313. std::vector<float> div(n_tokens, 0.0f);
  13314. for (int i = 0; i < n_tokens; ++i) {
  13315. const uint64_t s = sum[i];
  13316. if (s > 0) {
  13317. div[i] = 1.0f/float(s);
  13318. }
  13319. }
  13320. for (int s = 0; s < n_seqs; ++s) {
  13321. const llama_seq_id seq_id = batch.seq_id[s][0];
  13322. for (int i = 0; i < n_seq_tokens; ++i) {
  13323. data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
  13324. }
  13325. }
  13326. }
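// In effect inp_mean encodes, per sequence id, a weight of 1/len(seq) over that sequence's
// token positions and 0 elsewhere, so applying it to the token embeddings yields the
// per-sequence mean used for MEAN pooling.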
  13327. if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
  13328. const int64_t n_tokens = batch.n_tokens;
  13329. const int64_t n_seq_tokens = batch.n_seq_tokens;
  13330. const int64_t n_seqs = batch.n_seqs;
  13331. GGML_ASSERT(lctx.inp_cls);
  13332. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
  13333. uint32_t * data = (uint32_t *) lctx.inp_cls->data;
  13334. memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
  13335. for (int s = 0; s < n_seqs; ++s) {
  13336. const llama_seq_id seq_id = batch.seq_id[s][0];
  13337. // TODO: adapt limits to n_seqs when batch.equal_seqs is true
  13338. GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");
  13339. for (int i = 0; i < n_seq_tokens; ++i) {
  13340. const llama_pos pos = batch.pos[s*n_seq_tokens + i];
  13341. if (pos == 0) {
  13342. data[seq_id] = s*n_seq_tokens + i;
  13343. }
  13344. }
  13345. }
  13346. }
  13347. if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
  13348. const int64_t n_tokens = batch.n_tokens;
  13349. const int64_t n_seq_tokens = batch.n_seq_tokens;
  13350. const int64_t n_seqs = batch.n_seqs;
  13351. GGML_ASSERT(lctx.inp_cls);
  13352. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
  13353. uint32_t * data = (uint32_t *) lctx.inp_cls->data;
  13354. memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
  13355. std::vector<int> last_pos(n_tokens, -1);
  13356. std::vector<int> last_row(n_tokens, -1);
  13357. for (int s = 0; s < n_seqs; ++s) {
  13358. const llama_seq_id seq_id = batch.seq_id[s][0];
  13359. // TODO: adapt limits to n_seqs when batch.equal_seqs is true
  13360. GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
  13361. for (int i = 0; i < n_seq_tokens; ++i) {
  13362. const llama_pos pos = batch.pos[s*n_seq_tokens + i];
  13363. if (pos >= last_pos[seq_id]) {
  13364. last_pos[seq_id] = pos;
  13365. last_row[seq_id] = s*n_seq_tokens + i;
  13366. }
  13367. }
  13368. }
  13369. for (int i = 0; i < n_tokens; ++i) {
  13370. if (last_row[i] >= 0) {
  13371. data[i] = last_row[i];
  13372. }
  13373. }
  13374. }
  13375. if (kv_self.recurrent) {
  13376. const int64_t n_kv = kv_self.n;
  13377. if (lctx.inp_s_mask) {
  13378. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer));
  13379. float * data = (float *) lctx.inp_s_mask->data;
  13380. // clear unused states
  13381. for (int i = 0; i < n_kv; ++i) {
  13382. uint32_t cell_id = i + kv_self.head;
  13383. llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
  13384. data[i] = (float) (kv_cell.src >= 0);
  13385. // only clear once
  13386. if (kv_cell.src < 0) {
  13387. kv_cell.src = cell_id;
  13388. }
  13389. }
  13390. }
  13391. if (lctx.inp_s_copy) {
  13392. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
  13393. int32_t * data = (int32_t *) lctx.inp_s_copy->data;
  13394. // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
  13395. for (uint32_t i = 0; i < n_kv; ++i) {
  13396. const uint32_t cell_id = i + kv_self.head;
  13397. llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];
  13398. // prevent out-of-bound sources
  13399. if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) {
  13400. kv_cell.src = cell_id;
  13401. }
  13402. data[i] = kv_cell.src;
  13403. // ensure copy only happens once
  13404. if (kv_cell.src != (int32_t) cell_id) {
  13405. kv_cell.src = cell_id;
  13406. }
  13407. }
  13408. }
  13409. }
  13410. if (lctx.inp_pos_bucket) {
  13411. const int64_t n_tokens = batch.n_tokens;
  13412. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer));
  13413. GGML_ASSERT(!batch.equal_seqs); // TODO: use batch.n_seqs instead of failing
  13414. int32_t * data = (int32_t *) lctx.inp_pos_bucket->data;
  13415. if (!lctx.is_encoding) {
  13416. const int64_t n_kv = kv_self.n;
  13417. for (int h = 0; h < 1; ++h) {
  13418. for (int j = 0; j < n_tokens; ++j) {
  13419. for (int i = 0; i < n_kv; ++i) {
  13420. data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
  13421. }
  13422. }
  13423. }
  13424. } else {
  13425. for (int h = 0; h < 1; ++h) {
  13426. for (int j = 0; j < n_tokens; ++j) {
  13427. for (int i = 0; i < n_tokens; ++i) {
  13428. data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(batch.pos[i], batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
  13429. }
  13430. }
  13431. }
  13432. }
  13433. }
  13434. if (!lctx.is_encoding && lctx.inp_embd_enc) {
  13435. assert(lctx.inp_embd_enc->type == GGML_TYPE_F32);
  13436. assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size());
  13437. ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc));
  13438. }
  13439. if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) {
  13440. const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd;
  13441. const int64_t n_tokens = batch.n_tokens;
  13442. GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer));
  13443. GGML_ASSERT(!batch.equal_seqs); // TODO: use batch.n_seqs instead of failing
  13444. float * data = (float *) lctx.inp_KQ_mask_cross->data;
  13445. for (int h = 0; h < 1; ++h) {
  13446. for (int j = 0; j < n_tokens; ++j) {
  13447. for (int i = 0; i < n_output_enc; ++i) {
  13448. float f = -INFINITY;
  13449. for (int s = 0; s < batch.n_seq_id[j]; ++s) {
  13450. const llama_seq_id seq_id = batch.seq_id[j][s];
  13451. if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) {
  13452. f = 0.0f;
  13453. }
  13454. }
  13455. data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f;
  13456. }
  13457. }
  13458. for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
  13459. for (int j = 0; j < n_output_enc; ++j) {
  13460. data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY;
  13461. }
  13462. }
  13463. }
  13464. }
  13465. }
  13466. // Make sure enough space is available for outputs.
  13467. // Returns max number of outputs for which space was reserved.
  13468. static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
  13469. const auto & cparams = lctx.cparams;
  13470. const auto & hparams = lctx.model.hparams;
  13471. const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);
  13472. const auto n_batch = cparams.n_batch;
  13473. const auto n_vocab = hparams.n_vocab;
  13474. const auto n_embd = hparams.n_embd;
  13475. // TODO: use a per-batch flag for logits presence instead
  13476. const bool has_logits = cparams.causal_attn;
  13477. const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
  13478. const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
  13479. const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
  13480. if (lctx.output_ids.empty()) {
  13481. // init, never resized afterwards
  13482. lctx.output_ids.resize(n_batch);
  13483. }
  13484. const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
  13485. const size_t new_size = (logits_size + embd_size) * sizeof(float);
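// For example, with has_logits, n_vocab = 32000 and n_outputs_max = 512, logits_size is
// 32000 * 512 floats (~62.5 MiB); embd_size adds n_embd * 512 floats when embeddings are
// requested with no pooling.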
  13486. // alloc only when more than the current capacity is required
  13487. // TODO: also consider shrinking the buffer
  13488. if (!lctx.buf_output || prev_size < new_size) {
  13489. if (lctx.buf_output) {
  13490. #ifndef NDEBUG
  13491. // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
  13492. LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  13493. #endif
  13494. ggml_backend_buffer_free(lctx.buf_output);
  13495. lctx.buf_output = nullptr;
  13496. lctx.logits = nullptr;
  13497. lctx.embd = nullptr;
  13498. }
  13499. lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
  13500. if (lctx.buf_output == nullptr) {
  13501. LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
  13502. return 0;
  13503. }
  13504. }
  13505. float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);
  13506. lctx.logits = has_logits ? output_base : nullptr;
  13507. lctx.embd = has_embd ? output_base + logits_size : nullptr;
  13508. lctx.output_size = n_outputs_max;
  13509. lctx.logits_size = logits_size;
  13510. lctx.embd_size = embd_size;
  13511. // set all ids as invalid (negative)
  13512. std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);
  13513. ggml_backend_buffer_clear(lctx.buf_output, 0);
  13514. lctx.n_outputs = 0;
  13515. return n_outputs_max;
  13516. }
  13517. // make the outputs have the same order they had in the user-provided batch
  13518. static void llama_output_reorder(struct llama_context * ctx) {
  13519. std::vector<size_t> & out_ids = ctx->sbatch.out_ids;
  13520. if (!out_ids.empty()) {
  13521. uint32_t n_vocab = ctx->model.hparams.n_vocab;
  13522. uint32_t n_embd = ctx->model.hparams.n_embd;
  13523. int32_t n_outputs = ctx->n_outputs;
  13524. GGML_ASSERT((size_t) n_outputs == out_ids.size());
  13525. // TODO: is there something more efficient which also minimizes swaps?
  13526. // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
  13527. for (int32_t i = 0; i < n_outputs - 1; ++i) {
  13528. int32_t j_min = i;
  13529. for (int32_t j = i + 1; j < n_outputs; ++j) {
  13530. if (out_ids[j] < out_ids[j_min]) {
  13531. j_min = j;
  13532. }
  13533. }
  13534. if (j_min == i) { continue; }
  13535. std::swap(out_ids[i], out_ids[j_min]);
  13536. if (ctx->logits_size > 0) {
  13537. for (uint32_t k = 0; k < n_vocab; k++) {
  13538. std::swap(ctx->logits[i*n_vocab + k], ctx->logits[j_min*n_vocab + k]);
  13539. }
  13540. }
  13541. if (ctx->embd_size > 0) {
  13542. for (uint32_t k = 0; k < n_embd; k++) {
  13543. std::swap(ctx->embd[i*n_embd + k], ctx->embd[j_min*n_embd + k]);
  13544. }
  13545. }
  13546. }
  13547. std::fill(ctx->output_ids.begin(), ctx->output_ids.end(), -1);
  13548. for (int32_t i = 0; i < n_outputs; ++i) {
  13549. ctx->output_ids[out_ids[i]] = i;
  13550. }
  13551. out_ids.clear();
  13552. }
  13553. }
  13554. static void llama_graph_compute(
  13555. llama_context & lctx,
  13556. ggml_cgraph * gf,
  13557. int n_threads,
  13558. ggml_threadpool * threadpool) {
  13559. #ifdef GGML_USE_METAL
  13560. if (ggml_backend_is_metal(lctx.backend_metal)) {
  13561. ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
  13562. }
  13563. #endif
  13564. if (lctx.backend_cpu != nullptr) {
  13565. ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
  13566. ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
  13567. ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
  13568. }
  13569. #ifdef GGML_USE_BLAS
  13570. if (lctx.backend_blas != nullptr) {
  13571. ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
  13572. }
  13573. #endif
  13574. ggml_backend_sched_graph_compute_async(lctx.sched, gf);
  13575. // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
  13576. }
  13577. // decode a batch of tokens by evaluating the transformer
  13578. //
  13579. // - lctx: llama context
  13580. // - batch: batch to evaluate
  13581. //
  13582. // return 0 on success
  13583. // return positive int on warning
  13584. // return negative int on error
  13585. //
  13586. static int llama_decode_internal(
  13587. llama_context & lctx,
  13588. llama_batch batch_all) { // TODO: rename back to batch
  13589. lctx.is_encoding = false;
  13590. const uint32_t n_tokens_all = batch_all.n_tokens;
  13591. if (n_tokens_all == 0) {
13592. LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
  13593. return -1;
  13594. }
  13595. const auto & model = lctx.model;
  13596. const auto & hparams = model.hparams;
  13597. const auto & cparams = lctx.cparams;
  13598. GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
  13599. GGML_ASSERT(n_tokens_all <= cparams.n_batch);
  13600. GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
  13601. if (lctx.t_compute_start_us == 0) {
  13602. lctx.t_compute_start_us = ggml_time_us();
  13603. }
  13604. lctx.n_queued_tokens += n_tokens_all;
  13605. auto & kv_self = lctx.kv_self;
  13606. const int64_t n_embd = hparams.n_embd;
  13607. const int64_t n_vocab = hparams.n_vocab;
  13608. uint32_t n_outputs = 0;
  13609. uint32_t n_outputs_prev = 0;
  13610. const auto n_ubatch = cparams.n_ubatch;
  13611. // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
  13612. const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
  13613. lctx.embd_seq.clear();
  13614. // count outputs
  13615. if (batch_all.logits && !embd_pooled) {
  13616. for (uint32_t i = 0; i < n_tokens_all; ++i) {
  13617. n_outputs += batch_all.logits[i] != 0;
  13618. }
  13619. } else if (lctx.logits_all || embd_pooled) {
  13620. n_outputs = n_tokens_all;
  13621. } else {
  13622. // keep last output only
  13623. n_outputs = 1;
  13624. }
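// e.g. batch_all.logits = {0, 0, 1, 1} gives n_outputs = 2; logits_all or pooled
// embeddings force one output per token, and the fallback keeps only the last token.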
  13625. lctx.sbatch.from_batch(batch_all, n_embd,
  13626. /* simple_split */ !kv_self.recurrent,
  13627. /* logits_all */ n_outputs == n_tokens_all);
  13628. // reserve output buffer
  13629. if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
  13630. LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
  13631. return -2;
  13632. };
  13633. while (lctx.sbatch.n_tokens > 0) {
  13634. llama_ubatch ubatch;
  13635. if (kv_self.recurrent) {
  13636. if (embd_pooled) {
  13637. // Pooled embeddings cannot be split across ubatches (yet)
  13638. ubatch = lctx.sbatch.split_seq(n_ubatch);
  13639. } else {
  13640. // recurrent model architectures are easier to implement
  13641. // with equal-length sequences
  13642. ubatch = lctx.sbatch.split_equal(n_ubatch);
  13643. }
  13644. } else {
  13645. ubatch = lctx.sbatch.split_simple(n_ubatch);
  13646. }
  13647. const uint32_t n_tokens = ubatch.n_tokens;
  13648. // count the outputs in this u_batch
  13649. {
  13650. int32_t n_outputs_new = 0;
  13651. if (n_outputs == n_tokens_all) {
  13652. n_outputs_new = n_tokens;
  13653. } else {
  13654. GGML_ASSERT(ubatch.output);
  13655. for (uint32_t i = 0; i < n_tokens; i++) {
  13656. n_outputs_new += (int32_t) (ubatch.output[i] != 0);
  13657. }
  13658. }
  13659. // needs to happen before the graph is built
  13660. lctx.n_outputs = n_outputs_new;
  13661. }
  13662. int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
  13663. ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
  13664. GGML_ASSERT(n_threads > 0);
  13665. // non-causal masks do not use the KV cache
  13666. if (hparams.causal_attn) {
  13667. llama_kv_cache_update(&lctx);
  13668. // if we have enough unused cells before the current head ->
  13669. // better to start searching from the beginning of the cache, hoping to fill it
  13670. if (kv_self.head > kv_self.used + 2*n_tokens) {
  13671. kv_self.head = 0;
  13672. }
  13673. if (!llama_kv_cache_find_slot(kv_self, ubatch)) {
  13674. return 1;
  13675. }
  13676. if (!kv_self.recurrent) {
  13677. // a heuristic, to avoid attending the full cache if it is not yet utilized
  13678. // after enough generations, the benefit from this heuristic disappears
  13679. // if we start defragmenting the cache, the benefit from this will be more important
  13680. const uint32_t pad = llama_kv_cache_get_padding(cparams);
  13681. kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
  13682. //kv_self.n = llama_kv_cache_cell_max(kv_self);
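// e.g. with pad = 32 and llama_kv_cache_cell_max(kv_self) = 70, kv_self.n becomes 96,
// so attention and the KQ mask cover only the first 96 cells instead of the full cache.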
  13683. }
  13684. }
  13685. //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
  13686. ggml_backend_sched_reset(lctx.sched);
  13687. ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
  13688. ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
  13689. // the output is always the last tensor in the graph
  13690. struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
  13691. struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
  13692. if (lctx.n_outputs == 0) {
  13693. // no output
  13694. res = nullptr;
  13695. embd = nullptr;
  13696. }
  13697. if (cparams.embeddings) {
  13698. for (int i = gf->n_nodes - 1; i >= 0; --i) {
  13699. embd = gf->nodes[i];
  13700. if (strcmp(embd->name, "result_embd_pooled") == 0) {
  13701. break;
  13702. }
  13703. }
  13704. } else {
  13705. embd = nullptr; // do not extract embeddings when not needed
  13706. GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
  13707. }
  13708. if (!cparams.causal_attn) {
  13709. res = nullptr; // do not extract logits when not needed
  13710. }
  13711. // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
  13712. ggml_backend_sched_alloc_graph(lctx.sched, gf);
  13713. llama_set_inputs(lctx, ubatch);
  13714. llama_graph_compute(lctx, gf, n_threads, threadpool);
  13715. // update the kv ring buffer
  13716. {
  13717. kv_self.head += n_tokens;
  13718. // Ensure kv cache head points to a valid index.
  13719. if (kv_self.head >= kv_self.size) {
  13720. kv_self.head = 0;
  13721. }
  13722. }
  13723. // plot the computation graph in dot format (for debugging purposes)
  13724. //if (n_past%100 == 0) {
  13725. // ggml_graph_dump_dot(gf, NULL, "llama.dot");
  13726. //}
  13727. // extract logits
  13728. if (res) {
  13729. ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
  13730. GGML_ASSERT(backend_res != nullptr);
  13731. GGML_ASSERT(lctx.logits != nullptr);
  13732. float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
  13733. const int32_t n_outputs_new = lctx.n_outputs;
  13734. if (n_outputs_new) {
  13735. GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
  13736. GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
  13737. ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
  13738. }
  13739. }
  13740. // extract embeddings
  13741. if (embd) {
  13742. ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
  13743. GGML_ASSERT(backend_embd != nullptr);
  13744. switch (cparams.pooling_type) {
  13745. case LLAMA_POOLING_TYPE_NONE:
  13746. {
  13747. // extract token embeddings
  13748. GGML_ASSERT(lctx.embd != nullptr);
  13749. float * embd_out = lctx.embd + n_outputs_prev*n_embd;
  13750. const int32_t n_outputs_new = lctx.n_outputs;
  13751. if (n_outputs_new) {
  13752. GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
  13753. GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
  13754. ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
  13755. }
  13756. } break;
  13757. case LLAMA_POOLING_TYPE_MEAN:
  13758. case LLAMA_POOLING_TYPE_CLS:
  13759. case LLAMA_POOLING_TYPE_LAST:
  13760. {
  13761. // extract sequence embeddings (cleared before processing each batch)
  13762. auto & embd_seq_out = lctx.embd_seq;
  13763. for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
  13764. const llama_seq_id seq_id = ubatch.seq_id[s][0];
  13765. if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
  13766. continue;
  13767. }
  13768. embd_seq_out[seq_id].resize(n_embd);
  13769. ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
  13770. }
  13771. } break;
  13772. case LLAMA_POOLING_TYPE_UNSPECIFIED:
  13773. {
  13774. GGML_ABORT("unknown pooling type");
  13775. }
  13776. }
  13777. }
  13778. n_outputs_prev += lctx.n_outputs;
  13779. }
  13780. // set output mappings
  13781. {
  13782. bool sorted_output = true;
  13783. GGML_ASSERT(lctx.sbatch.out_ids.size() == n_outputs);
  13784. for (size_t i = 0; i < n_outputs; ++i) {
  13785. size_t out_id = lctx.sbatch.out_ids[i];
  13786. lctx.output_ids[out_id] = i;
  13787. if (out_id != i) {
  13788. sorted_output = false;
  13789. }
  13790. }
  13791. if (sorted_output) {
  13792. lctx.sbatch.out_ids.clear();
  13793. }
  13794. }
  13795. // set to total number of outputs in the batch, for use in llama_get_logits_ith
  13796. lctx.n_outputs = n_outputs;
  13797. // wait for the computation to finish (automatically done when obtaining the model output)
  13798. //llama_synchronize(&lctx);
  13799. // decide if we need to defrag the kv cache
  13800. if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
  13801. const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
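// e.g. kv_self.n = 256 with kv_self.used = 128 gives fragmentation = 0.5, which queues a
// defrag for any defrag_thold below 0.5; caches with n < 128 are never treated as fragmented here.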
  13802. // queue defragmentation for next llama_kv_cache_update
  13803. if (fragmentation > cparams.defrag_thold) {
  13804. //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
  13805. llama_kv_cache_defrag(kv_self);
  13806. }
  13807. }
  13808. // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
  13809. // overlap with device computation.
  13810. ggml_backend_sched_reset(lctx.sched);
  13811. return 0;
  13812. }
  13813. // encode a batch of tokens by evaluating the encoder part of the transformer
  13814. //
  13815. // - lctx: llama context
  13816. // - batch: batch to evaluate
  13817. //
  13818. // return 0 on success
  13819. // return positive int on warning
  13820. // return negative int on error
  13821. //
  13822. static int llama_encode_internal(
  13823. llama_context & lctx,
  13824. llama_batch batch) {
  13825. lctx.is_encoding = true;
  13826. const uint32_t n_tokens = batch.n_tokens;
  13827. if (n_tokens == 0) {
13828. LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
  13829. return -1;
  13830. }
  13831. const auto & model = lctx.model;
  13832. const auto & hparams = model.hparams;
  13833. const auto & cparams = lctx.cparams;
  13834. GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  13835. // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
  13836. GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");
  13837. if (lctx.t_compute_start_us == 0) {
  13838. lctx.t_compute_start_us = ggml_time_us();
  13839. }
  13840. lctx.n_queued_tokens += n_tokens;
  13841. const int64_t n_embd = hparams.n_embd;
  13842. lctx.sbatch.from_batch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
  13843. const llama_ubatch ubatch = lctx.sbatch.split_simple(n_tokens);
  13844. // reserve output buffer
  13845. if (llama_output_reserve(lctx, n_tokens) < n_tokens) {
  13846. LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
  13847. return -2;
  13848. };
  13849. for (uint32_t i = 0; i < n_tokens; ++i) {
  13850. lctx.output_ids[i] = i;
  13851. }
  13852. lctx.inp_embd_enc = NULL;
  13853. lctx.n_outputs = n_tokens;
  13854. int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
  13855. ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
  13856. GGML_ASSERT(n_threads > 0);
  13857. ggml_backend_sched_reset(lctx.sched);
  13858. ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
  13859. ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
  13860. // the output embeddings after the final encoder normalization
  13861. struct ggml_tensor * embd = nullptr;
  13862. // there are two cases here
  13863. if (llama_model_has_decoder(&lctx.model)) {
  13864. // first case is an encoder-decoder T5 model where embeddings are passed to decoder
  13865. embd = gf->nodes[gf->n_nodes - 1];
13866. GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_norm tensor");
  13867. } else {
  13868. // second case is an encoder-only T5 model
  13869. if (cparams.embeddings) {
  13870. // only output embeddings if required
  13871. embd = gf->nodes[gf->n_nodes - 1];
  13872. if (strcmp(embd->name, "result_embd_pooled") != 0) {
  13873. embd = gf->nodes[gf->n_nodes - 2];
  13874. }
  13875. GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
  13876. }
  13877. }
  13878. ggml_backend_sched_alloc_graph(lctx.sched, gf);
  13879. llama_set_inputs(lctx, ubatch);
  13880. llama_graph_compute(lctx, gf, n_threads, threadpool);
  13881. // extract embeddings
  13882. if (embd) {
  13883. ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
  13884. GGML_ASSERT(backend_embd != nullptr);
  13885. if (llama_model_has_decoder(&lctx.model)) {
  13886. lctx.embd_enc.resize(n_tokens*n_embd);
  13887. float * embd_out = lctx.embd_enc.data();
  13888. ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
  13889. GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits
  13890. // remember the sequence ids used during the encoding - needed for cross attention later
  13891. lctx.seq_ids_enc.resize(n_tokens);
  13892. for (uint32_t i = 0; i < n_tokens; i++) {
  13893. for (int s = 0; s < ubatch.n_seq_id[i]; s++) {
  13894. llama_seq_id seq_id = ubatch.seq_id[i][s];
  13895. lctx.seq_ids_enc[i].insert(seq_id);
  13896. }
  13897. }
  13898. } else {
  13899. GGML_ASSERT(lctx.embd != nullptr);
  13900. switch (cparams.pooling_type) {
  13901. case LLAMA_POOLING_TYPE_NONE:
  13902. {
  13903. // extract token embeddings
  13904. GGML_ASSERT(lctx.embd != nullptr);
  13905. float * embd_out = lctx.embd;
  13906. GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
  13907. ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
  13908. } break;
  13909. case LLAMA_POOLING_TYPE_MEAN:
  13910. case LLAMA_POOLING_TYPE_CLS:
  13911. case LLAMA_POOLING_TYPE_LAST:
  13912. {
  13913. // extract sequence embeddings
  13914. auto & embd_seq_out = lctx.embd_seq;
  13915. embd_seq_out.clear();
  13916. GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits
  13917. for (uint32_t i = 0; i < n_tokens; i++) {
  13918. const llama_seq_id seq_id = ubatch.seq_id[i][0];
  13919. if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
  13920. continue;
  13921. }
  13922. embd_seq_out[seq_id].resize(n_embd);
  13923. ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
  13924. }
  13925. } break;
  13926. case LLAMA_POOLING_TYPE_UNSPECIFIED:
  13927. {
  13928. GGML_ABORT("unknown pooling type");
  13929. }
  13930. }
  13931. }
  13932. }
  13933. // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
  13934. // overlap with device computation.
  13935. ggml_backend_sched_reset(lctx.sched);
  13936. return 0;
  13937. }
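// For illustration only - a minimal sketch of how a caller reaches this path through the
// public API (assumes the llama_encode()/llama_get_embeddings_seq() wrappers declared in
// llama.h and an encoder model created with embeddings and a pooling type enabled); kept
// as a comment:
//
//     llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size(), 0, 0);
//     if (llama_encode(ctx, batch) != 0) {
//         // negative return values indicate an error (see the contract above)
//     }
//     const float * emb = llama_get_embeddings_seq(ctx, 0); // n_embd floats for sequence 0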
  13938. // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
  13939. static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
  13940. auto & kv_self = lctx.kv_self;
  13941. const auto & hparams = lctx.model.hparams;
  13942. const uint32_t n_layer = hparams.n_layer;
  13943. const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
  13944. const uint32_t n_used = kv_self.used;
  13945. assert(n_used <= n_kv);
  13946. //const int64_t t_start = ggml_time_us();
  13947. // number of cells moved
  13948. uint32_t n_moves = 0;
  13949. // each move requires 6*n_layer tensors (see build_defrag)
  13950. // - source view, destination view, copy operation
  13951. // - x2 for keys and values
  13952. //const uint32_t max_moves = llama_model_max_nodes(model)/(6*n_layer);
  13953. // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
  13954. const uint32_t max_moves = (llama_model_max_nodes(lctx.model) - 2*n_layer)/(6*n_layer);
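// worked example (illustrative numbers): for a 32-layer model with a graph budget of
// 8192 nodes, max_moves = (8192 - 2*32)/(6*32) = 8128/192 = 42, i.e. at most 42
// contiguous ranges of cells can be moved per defrag graph before the node limit
// would be exceeded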
  13955. // determine which KV cells to move where
  13956. //
  13957. // cell i moves to ids[i]
  13958. //
  13959. // if ids[i] == i || ids[i] == n_kv, then cell i is not moved
  13960. //
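// worked example (illustrative): with n_kv = 8 and only cells 2 and 3 empty, the scan
// below fills the two-cell hole with the last two occupied cells, giving
//   ids = { 0, 1, 8, 8, 4, 5, 2, 3 }
// i.e. cells 6 and 7 move to slots 2 and 3, the empty cells keep the sentinel value n_kv,
// and all other cells stay where they are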
  13961. std::vector<uint32_t> ids(n_kv, n_kv);
  13962. for (uint32_t i0 = 0; i0 < n_used; ++i0) {
  13963. const auto & cell0 = kv_self.cells[i0];
  13964. if (!cell0.is_empty()) {
  13965. ids[i0] = i0;
  13966. continue;
  13967. }
  13968. // found a hole - fill it with data from the end of the cache
  13969. uint32_t nh = 1;
  13970. // determine the size of the hole
  13971. while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
  13972. nh++;
  13973. }
  13974. uint32_t nf = 0;
  13975. uint32_t is = n_kv - 1;
  13976. // starting from the end, find nh non-empty cells
  13977. for (; is > i0; --is) {
  13978. const auto & cell1 = kv_self.cells[is];
  13979. if (cell1.is_empty() || ids[is] != n_kv) {
  13980. continue;
  13981. }
  13982. // non-empty cell which is not yet moved
  13983. nf++;
  13984. if (nf == nh) {
  13985. break;
  13986. }
  13987. }
  13988. // this can only happen if `n_used` is not accurate, which would be a bug
  13989. GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
  13990. nf = 0;
  13991. uint32_t i1 = is;
13992. // are we moving a contiguous block of memory?
  13993. bool cont = false;
  13994. // should we stop searching for the next move?
  13995. bool stop = false;
  13996. // go back and move the nf cells to the hole
  13997. for (; i1 < n_kv; ++i1) {
  13998. auto & cell1 = kv_self.cells[i1];
  13999. if (cell1.is_empty() || ids[i1] != n_kv) {
  14000. if (n_moves == max_moves) {
  14001. stop = true;
  14002. break;
  14003. }
  14004. cont = false;
  14005. continue;
  14006. }
  14007. // this cell goes to (i0 + nf)
  14008. ids[i1] = i0 + nf;
  14009. // move the cell meta data
  14010. kv_self.cells[i0 + nf] = cell1;
  14011. // clear the old cell and move the head there
  14012. cell1 = llama_kv_cell();
  14013. kv_self.head = n_used;
  14014. if (!cont) {
  14015. n_moves++;
  14016. cont = true;
  14017. }
  14018. nf++;
  14019. if (nf == nh) {
  14020. break;
  14021. }
  14022. }
  14023. if (stop || n_moves == max_moves) {
  14024. break;
  14025. }
  14026. //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
  14027. i0 += nh - 1;
  14028. }
  14029. if (n_moves == 0) {
  14030. return;
  14031. }
  14032. //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
  14033. //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
  14034. #if 0
  14035. // CPU defrag
  14036. //
  14037. // TODO: optimizations are possible:
  14038. // - multiple threads
  14039. // - avoid copying to the host memory when already there
  14040. //
  14041. // likely not worth the effort, as we have ggml_graph based defrag
  14042. //
  14043. const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  14044. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
  14045. const uint32_t kv_size = kv_self.size;
  14046. std::vector<uint8_t> buf_k;
  14047. std::vector<uint8_t> buf_v;
  14048. for (uint32_t il = 0; il < n_layer; ++il) {
  14049. const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
  14050. const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
  14051. const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
  14052. const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
  14053. buf_k.resize(k_size);
  14054. buf_v.resize(v_size);
  14055. ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
  14056. ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
  14057. // batch move [i, i+nm) to [id, id+nm)
  14058. // note: cells can move only to a lower index
  14059. for (uint32_t i = 0; i < n_kv; ++i) {
  14060. const uint32_t id = ids[i];
  14061. if (i == id || id == n_kv) {
  14062. continue;
  14063. }
  14064. uint32_t nm = 1;
  14065. while (i + nm < n_kv && ids[i + nm] == id + nm) {
  14066. nm++;
  14067. }
  14068. // move keys
  14069. {
  14070. const int64_t os = i*k_size_row;
  14071. const int64_t od = id*k_size_row;
  14072. memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
  14073. }
  14074. // move values (note: they are transposed)
  14075. {
  14076. const int64_t os = i;
  14077. const int64_t od = id;
  14078. for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
  14079. memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
  14080. }
  14081. }
  14082. i += nm - 1;
  14083. }
  14084. ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
  14085. ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
  14086. }
  14087. #else
  14088. // ggml_graph defrag
  14089. ggml_backend_sched_reset(lctx.sched);
  14090. ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
  14091. llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
  14092. #endif
  14093. //const int64_t t_end = ggml_time_us();
  14094. //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
  14095. }
  14096. static void llama_kv_cache_update_internal(struct llama_context & lctx) {
  14097. bool need_reserve = false;
  14098. // apply K-shift if needed
  14099. if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
  14100. if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
  14101. GGML_ABORT("Deepseek2 does not support K-shift");
  14102. }
  14103. {
  14104. ggml_backend_sched_reset(lctx.sched);
  14105. ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
  14106. ggml_backend_sched_alloc_graph(lctx.sched, gf);
  14107. llama_set_k_shift(lctx);
  14108. llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
  14109. need_reserve = true;
  14110. }
  14111. {
  14112. auto & kv_self = lctx.kv_self;
  14113. kv_self.has_shift = false;
  14114. for (uint32_t i = 0; i < kv_self.size; ++i) {
  14115. kv_self.cells[i].delta = 0;
  14116. }
  14117. }
  14118. }
  14119. // defragment the KV cache if needed
  14120. if (lctx.kv_self.do_defrag) {
  14121. llama_kv_cache_defrag_internal(lctx);
  14122. need_reserve = true;
  14123. lctx.kv_self.do_defrag = false;
  14124. }
  14125. // reserve a worst case graph again
  14126. if (need_reserve) {
  14127. // TODO: extract to a function
  14128. // build worst-case graph
  14129. uint32_t n_seqs = 1; // TODO: worst-case number of sequences
  14130. uint32_t n_tokens = std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
14131. llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between the token and embedding input graphs
  14132. llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  14133. ggml_cgraph * gf = llama_build_graph(lctx, ubatch, true);
  14134. // initialize scheduler with the worst-case graph
  14135. ggml_backend_sched_reset(lctx.sched);
  14136. if (!ggml_backend_sched_reserve(lctx.sched, gf)) {
  14137. LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
  14138. }
  14139. }
  14140. }
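// For illustration only - how these internals are typically driven from the public API
// (assumes the llama_kv_cache_seq_rm()/llama_kv_cache_defrag()/llama_kv_cache_update()
// declarations in llama.h); kept as a comment:
//
//     llama_kv_cache_seq_rm(ctx, seq_id, p0, p1); // free a range of cells
//     llama_kv_cache_defrag(ctx);                 // schedule a defrag (sets do_defrag)
//     llama_kv_cache_update(ctx);                 // apply pending K-shift and defrag now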
  14141. //
  14142. // quantization
  14143. //
  14144. struct quantize_state_internal {
  14145. const llama_model & model;
  14146. const llama_model_quantize_params * params;
  14147. int n_attention_wv = 0;
  14148. int n_ffn_down = 0;
  14149. int n_ffn_gate = 0;
  14150. int n_ffn_up = 0;
  14151. int i_attention_wv = 0;
  14152. int i_ffn_down = 0;
  14153. int i_ffn_gate = 0;
  14154. int i_ffn_up = 0;
  14155. int n_k_quantized = 0;
  14156. int n_fallback = 0;
  14157. bool has_imatrix = false;
  14158. // used to figure out if a model shares tok_embd with the output weight
  14159. bool has_output = false;
  14160. quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
  14161. : model(model)
  14162. , params(params)
  14163. {}
  14164. };
  14165. static void llama_tensor_dequantize_internal(
  14166. struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
  14167. const size_t nelements, const int nthread
  14168. ) {
  14169. if (output.size() < nelements) {
  14170. output.resize(nelements);
  14171. }
  14172. float * f32_output = (float *) output.data();
  14173. ggml_type_traits_t qtype;
  14174. if (ggml_is_quantized(tensor->type)) {
  14175. qtype = ggml_internal_get_type_traits(tensor->type);
  14176. if (qtype.to_float == NULL) {
  14177. throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
  14178. }
  14179. } else if (tensor->type != GGML_TYPE_F16 &&
  14180. tensor->type != GGML_TYPE_BF16) {
  14181. throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
  14182. }
  14183. if (nthread < 2) {
  14184. if (tensor->type == GGML_TYPE_F16) {
  14185. ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
  14186. } else if (tensor->type == GGML_TYPE_BF16) {
  14187. ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
  14188. } else if (ggml_is_quantized(tensor->type)) {
  14189. qtype.to_float(tensor->data, f32_output, nelements);
  14190. } else {
  14191. GGML_ABORT("fatal error"); // unreachable
  14192. }
  14193. return;
  14194. }
  14195. size_t block_size;
  14196. if (tensor->type == GGML_TYPE_F16 ||
  14197. tensor->type == GGML_TYPE_BF16) {
  14198. block_size = 1;
  14199. } else {
  14200. block_size = (size_t)ggml_blck_size(tensor->type);
  14201. }
  14202. size_t block_size_bytes = ggml_type_size(tensor->type);
  14203. GGML_ASSERT(nelements % block_size == 0);
  14204. size_t nblocks = nelements / block_size;
  14205. size_t blocks_per_thread = nblocks / nthread;
  14206. size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
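// worked example (illustrative): a Q4_K tensor with nelements = 11008*4096 = 45088768 has
// block_size = 256, so nblocks = 176128; with nthread = 6 this gives
// blocks_per_thread = 29354 and spare_blocks = 4, and the 4 spare blocks are handled by
// the last thread in the loop below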
  14207. size_t in_buff_offs = 0;
  14208. size_t out_buff_offs = 0;
  14209. for (int tnum = 0; tnum < nthread; tnum++) {
  14210. size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
  14211. size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
  14212. size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
  14213. auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
  14214. if (typ == GGML_TYPE_F16) {
  14215. ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
  14216. } else if (typ == GGML_TYPE_BF16) {
  14217. ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
  14218. } else {
  14219. qtype.to_float(inbuf, outbuf, nels);
  14220. }
  14221. };
  14222. workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
  14223. in_buff_offs += thr_block_bytes;
  14224. out_buff_offs += thr_elems;
  14225. }
  14226. for (auto & w : workers) { w.join(); }
  14227. workers.clear();
  14228. }
  14229. static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
  14230. const std::string name = ggml_get_name(tensor);
  14231. // TODO: avoid hardcoded tensor names - use the TN_* constants
  14232. const llm_arch arch = qs.model.arch;
  14233. const auto tn = LLM_TN(arch);
  14234. auto use_more_bits = [](int i_layer, int n_layers) -> bool {
  14235. return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
  14236. };
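// worked example (illustrative): for a 32-layer model, use_more_bits() is true for the
// first 4 layers (i < 32/8), the last 4 layers (i >= 7*32/8 = 28), and every third layer
// in between starting at layer 6 (6, 9, 12, ..., 27) - 16 of the 32 layers in total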
  14237. const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
  14238. auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
  14239. if (n_expert > 1) {
  14240. // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
  14241. // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
  14242. // for getting the current layer as I initially thought, and we need to resort to parsing the
  14243. // tensor name.
  14244. if (sscanf(name, "blk.%d.", &i_layer) != 1) {
  14245. throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
  14246. }
  14247. if (i_layer < 0 || i_layer >= n_layer) {
  14248. throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
  14249. }
  14250. }
  14251. return std::make_pair(i_layer, n_layer);
  14252. };
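// worked example (illustrative): for an expert tensor named e.g. "blk.17.ffn_down_exps.weight",
// the sscanf("blk.%d.") above extracts i_layer = 17 directly from the tensor name instead of
// relying on the running i_ffn_down counter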
  14253. // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
  14254. // with the quantization of the output tensor
  14255. if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
  14256. if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
  14257. new_type = qs.params->output_tensor_type;
  14258. } else {
  14259. int nx = tensor->ne[0];
  14260. if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
  14261. new_type = GGML_TYPE_Q8_0;
  14262. }
  14263. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
  14264. ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
  14265. ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
  14266. new_type = GGML_TYPE_Q5_K;
  14267. }
  14268. else if (new_type != GGML_TYPE_Q8_0) {
  14269. new_type = GGML_TYPE_Q6_K;
  14270. }
  14271. }
  14272. } else if (name == "token_embd.weight") {
  14273. if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
  14274. new_type = qs.params->token_embedding_type;
  14275. } else {
  14276. if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
  14277. ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
  14278. new_type = GGML_TYPE_Q2_K;
  14279. }
  14280. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
  14281. new_type = GGML_TYPE_IQ3_S;
  14282. }
  14283. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
  14284. new_type = GGML_TYPE_IQ3_S;
  14285. }
  14286. else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
  14287. new_type == GGML_TYPE_Q4_0_8_8) {
  14288. new_type = GGML_TYPE_Q4_0;
  14289. }
  14290. }
  14291. } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
  14292. ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
  14293. if (name.find("attn_v.weight") != std::string::npos) {
  14294. if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
  14295. else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
  14296. ++qs.i_attention_wv;
  14297. }
  14298. else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
  14299. new_type = GGML_TYPE_Q4_K;
  14300. }
  14301. else if (name.find("ffn_down") != std::string::npos) {
  14302. if (qs.i_ffn_down < qs.n_ffn_down/8) {
  14303. new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
  14304. }
  14305. ++qs.i_ffn_down;
  14306. }
  14307. else if (name.find("attn_output.weight") != std::string::npos) {
  14308. if (qs.model.hparams.n_expert == 8) {
  14309. new_type = GGML_TYPE_Q5_K;
  14310. } else {
  14311. if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
  14312. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
  14313. }
  14314. }
  14315. } else if (name.find("attn_v.weight") != std::string::npos) {
  14316. if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
  14317. new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
  14318. }
  14319. else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
  14320. new_type = GGML_TYPE_Q4_K;
  14321. }
  14322. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
  14323. new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
  14324. }
  14325. else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
  14326. new_type = GGML_TYPE_Q4_K;
  14327. }
  14328. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
  14329. new_type = GGML_TYPE_Q4_K;
  14330. }
  14331. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
  14332. new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
  14333. }
  14334. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
  14335. else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
  14336. new_type = GGML_TYPE_Q5_K;
  14337. }
  14338. else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
  14339. use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
  14340. else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
  14341. if (qs.model.type == MODEL_70B) {
  14342. // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
  14343. // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
  14344. // nearly negligible increase in model size by quantizing this tensor with more bits:
  14345. if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
  14346. }
  14347. if (qs.model.hparams.n_expert == 8) {
  14348. // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
  14349. // TODO: explore better strategies
  14350. new_type = GGML_TYPE_Q8_0;
  14351. }
  14352. ++qs.i_attention_wv;
  14353. } else if (name.find("attn_k.weight") != std::string::npos) {
  14354. if (qs.model.hparams.n_expert == 8) {
  14355. // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
  14356. // TODO: explore better strategies
  14357. new_type = GGML_TYPE_Q8_0;
  14358. }
  14359. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
  14360. new_type = GGML_TYPE_IQ3_XXS;
  14361. }
  14362. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
  14363. new_type = GGML_TYPE_IQ2_S;
  14364. }
  14365. } else if (name.find("attn_q.weight") != std::string::npos) {
  14366. if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
  14367. new_type = GGML_TYPE_IQ3_XXS;
  14368. }
  14369. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
  14370. new_type = GGML_TYPE_IQ2_S;
  14371. }
  14372. } else if (name.find("ffn_down") != std::string::npos) {
  14373. auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
  14374. int i_layer = info.first, n_layer = info.second;
  14375. if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  14376. else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
  14377. if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
  14378. }
  14379. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
  14380. new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
  14381. }
  14382. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
  14383. new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
  14384. : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
  14385. : GGML_TYPE_Q3_K;
  14386. }
  14387. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
  14388. (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
  14389. new_type = GGML_TYPE_Q4_K;
  14390. }
  14391. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
  14392. new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
  14393. }
  14394. else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
  14395. if (arch == LLM_ARCH_FALCON) {
  14396. new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
  14397. use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
  14398. } else {
  14399. if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
  14400. }
  14401. }
  14402. else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
  14403. new_type = GGML_TYPE_Q5_K;
  14404. }
  14405. else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
  14406. else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
  14407. new_type = GGML_TYPE_Q5_K;
  14408. }
  14409. else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
  14410. && qs.has_imatrix && i_layer < n_layer/8) {
  14411. // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
  14412. // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
  14413. // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
  14414. new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
  14415. }
  14416. ++qs.i_ffn_down;
  14417. } else if (name.find("attn_output.weight") != std::string::npos) {
  14418. if (arch != LLM_ARCH_FALCON) {
  14419. if (qs.model.hparams.n_expert == 8) {
  14420. if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
  14421. ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
  14422. ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
  14423. ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
  14424. new_type = GGML_TYPE_Q5_K;
  14425. }
  14426. } else {
  14427. if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
  14428. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
  14429. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
  14430. else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
  14431. else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
  14432. }
  14433. } else {
  14434. if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
  14435. }
  14436. }
  14437. else if (name.find("attn_qkv.weight") != std::string::npos) {
  14438. if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
  14439. new_type = GGML_TYPE_Q4_K;
  14440. }
  14441. else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
  14442. else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
  14443. }
  14444. else if (name.find("ffn_gate") != std::string::npos) {
  14445. auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
  14446. int i_layer = info.first, n_layer = info.second;
  14447. if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
  14448. new_type = GGML_TYPE_IQ3_XXS;
  14449. }
  14450. ++qs.i_ffn_gate;
  14451. }
  14452. else if (name.find("ffn_up") != std::string::npos) {
  14453. auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
  14454. int i_layer = info.first, n_layer = info.second;
  14455. if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
  14456. new_type = GGML_TYPE_IQ3_XXS;
  14457. }
  14458. ++qs.i_ffn_up;
  14459. }
  14460. // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  14461. //}
  14462. // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
  14463. //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
  14464. // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
  14465. //}
  14466. // This can be used to reduce the size of the Q5_K_S model.
  14467. // The associated PPL increase is fully in line with the size reduction
  14468. //else {
  14469. // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
  14470. //}
  14471. bool convert_incompatible_tensor = false;
  14472. if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
  14473. new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
  14474. new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
  14475. new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
  14476. new_type == GGML_TYPE_IQ1_M) {
  14477. int nx = tensor->ne[0];
  14478. int ny = tensor->ne[1];
  14479. if (nx % QK_K != 0) {
  14480. LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
  14481. convert_incompatible_tensor = true;
  14482. } else {
  14483. ++qs.n_k_quantized;
  14484. }
  14485. }
  14486. if (convert_incompatible_tensor) {
  14487. switch (new_type) {
  14488. case GGML_TYPE_IQ2_XXS:
  14489. case GGML_TYPE_IQ2_XS:
  14490. case GGML_TYPE_IQ2_S:
  14491. case GGML_TYPE_IQ3_XXS:
  14492. case GGML_TYPE_IQ3_S:
  14493. case GGML_TYPE_IQ1_S:
  14494. case GGML_TYPE_IQ1_M:
  14495. case GGML_TYPE_Q2_K:
  14496. case GGML_TYPE_Q3_K:
  14497. case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
  14498. case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
  14499. case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
  14500. case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
  14501. default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
  14502. }
  14503. if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
  14504. new_type = GGML_TYPE_F16;
  14505. }
  14506. LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
  14507. ++qs.n_fallback;
  14508. }
  14509. return new_type;
  14510. }
  14511. static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
  14512. if (nthread < 2) {
  14513. // single-thread
  14514. size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
  14515. if (!ggml_validate_row_data(new_type, new_data, new_size)) {
  14516. throw std::runtime_error("quantized data validation failed");
  14517. }
  14518. return new_size;
  14519. }
  14520. std::mutex mutex;
  14521. int64_t counter = 0;
  14522. size_t new_size = 0;
  14523. bool valid = true;
  14524. auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
  14525. nrows, n_per_row, imatrix]() {
  14526. const int64_t nrows_per_chunk = chunk_size / n_per_row;
  14527. size_t local_size = 0;
  14528. while (true) {
  14529. std::unique_lock<std::mutex> lock(mutex);
  14530. int64_t first_row = counter; counter += nrows_per_chunk;
  14531. if (first_row >= nrows) {
  14532. if (local_size > 0) {
  14533. new_size += local_size;
  14534. }
  14535. break;
  14536. }
  14537. lock.unlock();
  14538. const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
  14539. size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
  14540. local_size += this_size;
  14541. // validate the quantized data
  14542. const size_t row_size = ggml_row_size(new_type, n_per_row);
  14543. void * this_data = (char *) new_data + first_row * row_size;
  14544. if (!ggml_validate_row_data(new_type, this_data, this_size)) {
  14545. std::unique_lock<std::mutex> lock(mutex);
  14546. valid = false;
  14547. break;
  14548. }
  14549. }
  14550. };
  14551. for (int it = 0; it < nthread - 1; ++it) {
  14552. workers.emplace_back(compute);
  14553. }
  14554. compute();
  14555. for (auto & w : workers) { w.join(); }
  14556. workers.clear();
  14557. if (!valid) {
  14558. throw std::runtime_error("quantized data validation failed");
  14559. }
  14560. return new_size;
  14561. }
  14562. static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
  14563. ggml_type default_type;
  14564. llama_ftype ftype = params->ftype;
  14565. switch (params->ftype) {
  14566. case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
  14567. case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
  14568. case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
  14569. case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
  14570. case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
  14571. case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
  14572. case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
  14573. case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
  14574. // K-quants
  14575. case LLAMA_FTYPE_MOSTLY_Q2_K_S:
  14576. case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break;
  14577. case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break;
  14578. case LLAMA_FTYPE_MOSTLY_Q3_K_S:
  14579. case LLAMA_FTYPE_MOSTLY_Q3_K_M:
  14580. case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
  14581. case LLAMA_FTYPE_MOSTLY_Q4_K_S:
  14582. case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
  14583. case LLAMA_FTYPE_MOSTLY_Q5_K_S:
  14584. case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break;
  14585. case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break;
  14586. case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
  14587. case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break;
  14588. case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break;
  14589. case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
  14590. case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
  14591. case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
  14592. case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
  14593. case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;
  14594. case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
  14595. case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
  14596. case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
  14597. case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
  14598. case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
  14599. case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
  14600. default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
  14601. }
  14602. int nthread = params->nthread;
  14603. if (nthread <= 0) {
  14604. nthread = std::thread::hardware_concurrency();
  14605. }
14606. // mmap consistently increases speed on Linux, and also increases speed on Windows with
14607. // a hot cache. It may cause a slowdown on macOS, possibly related to free memory.
  14608. #if defined(__linux__) || defined(_WIN32)
  14609. constexpr bool use_mmap = true;
  14610. #else
  14611. constexpr bool use_mmap = false;
  14612. #endif
  14613. llama_model_kv_override * kv_overrides = nullptr;
  14614. if (params->kv_overrides) {
  14615. auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
  14616. kv_overrides = v->data();
  14617. }
  14618. llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
  14619. ml.init_mappings(false); // no prefetching
  14620. llama_model model;
  14621. llm_load_arch(ml, model);
  14622. llm_load_hparams(ml, model);
  14623. struct quantize_state_internal qs(model, params);
  14624. if (params->only_copy) {
  14625. ftype = model.ftype;
  14626. }
  14627. const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
  14628. if (params->imatrix) {
  14629. imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
  14630. if (imatrix_data) {
  14631. LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
  14632. qs.has_imatrix = true;
  14633. // check imatrix for nans or infs
  14634. for (const auto & kv : *imatrix_data) {
  14635. for (float f : kv.second) {
  14636. if (!std::isfinite(f)) {
  14637. throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
  14638. }
  14639. }
  14640. }
  14641. }
  14642. }
  14643. const size_t align = GGUF_DEFAULT_ALIGNMENT;
  14644. struct gguf_context * ctx_out = gguf_init_empty();
  14645. // copy the KV pairs from the input file
  14646. gguf_set_kv (ctx_out, ml.meta);
  14647. gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
  14648. gguf_set_val_u32(ctx_out, "general.file_type", ftype); // TODO: use LLM_KV
  14649. // Remove split metadata
  14650. gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
  14651. gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
  14652. gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
  14653. if (params->kv_overrides) {
  14654. const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
  14655. for (auto & o : overrides) {
  14656. if (o.key[0] == 0) break;
  14657. if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
  14658. gguf_set_val_f32(ctx_out, o.key, o.val_f64);
  14659. } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
  14660. gguf_set_val_i32(ctx_out, o.key, o.val_i64);
  14661. } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
  14662. gguf_set_val_bool(ctx_out, o.key, o.val_bool);
  14663. } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
  14664. gguf_set_val_str(ctx_out, o.key, o.val_str);
  14665. } else {
  14666. LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
  14667. }
  14668. }
  14669. }
  14670. for (int i = 0; i < ml.n_tensors; ++i) {
  14671. const struct ggml_tensor * meta = ml.get_tensor_meta(i);
  14672. const std::string name = ggml_get_name(meta);
  14673. // TODO: avoid hardcoded tensor names - use the TN_* constants
  14674. if (name.find("attn_v.weight") != std::string::npos ||
  14675. name.find("attn_qkv.weight") != std::string::npos ||
  14676. name.find("attn_kv_b.weight")!= std::string::npos) {
  14677. ++qs.n_attention_wv;
  14678. } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
  14679. qs.has_output = true;
  14680. }
  14681. }
  14682. qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
  14683. // sanity checks
  14684. {
  14685. const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
  14686. // attention layers have a non-zero number of kv heads
  14687. int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
  14688. if (llama_model_has_encoder(&model)) {
  14689. n_attn_layer *= 3;
  14690. }
  14691. GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
  14692. }
  14693. size_t total_size_org = 0;
  14694. size_t total_size_new = 0;
  14695. std::vector<std::thread> workers;
  14696. workers.reserve(nthread);
  14697. int idx = 0;
  14698. std::vector<no_init<uint8_t>> read_data;
  14699. std::vector<no_init<uint8_t>> work;
  14700. std::vector<no_init<float>> f32_conv_buf;
  14701. uint16_t n_split = 1;
14702. // Assume the split index is contiguous
  14703. if (params->keep_split) {
  14704. for (int i = 0; i < ml.n_tensors; ++i) {
  14705. n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
  14706. }
  14707. }
  14708. std::vector<gguf_context*> ctx_outs(n_split, NULL);
  14709. ctx_outs[0] = ctx_out;
  14710. // populate the original tensors so we get an initial meta data
  14711. for (int i = 0; i < ml.n_tensors; ++i) {
  14712. auto weight = ml.get_weight(i);
  14713. uint16_t i_split = params->keep_split ? weight->idx : 0;
  14714. struct ggml_tensor * tensor = weight->tensor;
  14715. if (ctx_outs[i_split] == NULL) {
  14716. ctx_outs[i_split] = gguf_init_empty();
  14717. }
  14718. gguf_add_tensor(ctx_outs[i_split], tensor);
  14719. }
  14720. // Set split info if needed
  14721. if (n_split > 1) {
  14722. for (size_t i = 0; i < ctx_outs.size(); ++i) {
  14723. gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
  14724. gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
  14725. gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
  14726. }
  14727. }
  14728. int cur_split = -1;
  14729. std::ofstream fout;
  14730. auto close_ofstream = [&]() {
  14731. // Write metadata and close file handler
  14732. if (fout.is_open()) {
  14733. fout.seekp(0);
  14734. std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
  14735. gguf_get_meta_data(ctx_outs[cur_split], data.data());
  14736. fout.write((const char *) data.data(), data.size());
  14737. fout.close();
  14738. }
  14739. };
  14740. auto new_ofstream = [&](int index) {
  14741. cur_split = index;
14742. GGML_ASSERT(ctx_outs[cur_split] && "Found uninitialized gguf_context");
  14743. std::string fname = fname_out;
  14744. if (params->keep_split) {
  14745. char split_path[PATH_MAX] = {0};
  14746. llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
  14747. fname = std::string(split_path);
  14748. }
  14749. fout = std::ofstream(fname, std::ios::binary);
  14750. fout.exceptions(std::ofstream::failbit); // fail fast on write errors
  14751. const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
  14752. // placeholder for the meta data
  14753. ::zeros(fout, meta_size);
  14754. };
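// The two lambdas above implement a write-then-patch pattern: new_ofstream() reserves
// gguf_get_meta_size() bytes of zeros at the start of the file, the tensor data is then
// streamed after that placeholder, and close_ofstream() seeks back to offset 0 and
// overwrites the placeholder with the finalized metadata (which by then includes the
// actual tensor types, sizes and offsets set via gguf_set_tensor_type/data below).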
  14755. const auto tn = LLM_TN(model.arch);
  14756. new_ofstream(0);
  14757. for (int i = 0; i < ml.n_tensors; ++i) {
  14758. auto weight = ml.get_weight(i);
  14759. struct ggml_tensor * tensor = weight->tensor;
  14760. if (weight->idx != cur_split && params->keep_split) {
  14761. close_ofstream();
  14762. new_ofstream(weight->idx);
  14763. }
  14764. const std::string name = ggml_get_name(tensor);
  14765. if (!ml.use_mmap) {
  14766. if (read_data.size() < ggml_nbytes(tensor)) {
  14767. read_data.resize(ggml_nbytes(tensor));
  14768. }
  14769. tensor->data = read_data.data();
  14770. }
  14771. ml.load_data_for(tensor);
  14772. LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
  14773. ++idx, ml.n_tensors,
  14774. ggml_get_name(tensor),
  14775. llama_format_tensor_shape(tensor).c_str(),
  14776. ggml_type_name(tensor->type));
  14777. // This used to be a regex, but <regex> has an extreme cost to compile times.
  14778. bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
  14779. // quantize only 2D and 3D tensors (experts)
  14780. quantize &= (ggml_n_dims(tensor) >= 2);
  14781. // do not quantize norm tensors
  14782. quantize &= name.find("_norm.weight") == std::string::npos;
  14783. quantize &= params->quantize_output_tensor || name != "output.weight";
  14784. quantize &= !params->only_copy;
  14785. // do not quantize expert gating tensors
  14786. // NOTE: can't use LLM_TN here because the layer number is not known
  14787. quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
  14788. // do not quantize positional embeddings and token types (BERT)
  14789. quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
  14790. quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
  14791. // do not quantize Mamba's small yet 2D weights
  14792. // NOTE: can't use LLM_TN here because the layer number is not known
  14793. quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
  14794. // do not quantize RWKV's time_mix_first tensors
  14795. quantize &= name.find("time_mix_first.weight") == std::string::npos;
  14796. quantize &= name.find("time_mix_w1.weight") == std::string::npos;
  14797. quantize &= name.find("time_mix_w2.weight") == std::string::npos;
  14798. // do not quantize relative position bias (T5)
  14799. quantize &= name.find("attn_rel_b.weight") == std::string::npos;
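// worked examples of the filters above (illustrative): "blk.0.ffn_down.weight" and
// "blk.0.attn_v.weight" remain candidates for quantization, while "blk.0.attn_norm.weight"
// (a norm tensor), "blk.0.ffn_gate_inp.weight" (expert gating), and "output.weight" when
// quantize_output_tensor is disabled are all kept in their original type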
  14800. enum ggml_type new_type;
  14801. void * new_data;
  14802. size_t new_size;
  14803. if (quantize) {
  14804. new_type = default_type;
  14805. // get more optimal quantization type based on the tensor shape, layer, etc.
  14806. if (!params->pure && ggml_is_quantized(default_type)) {
  14807. new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
  14808. }
  14809. if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
  14810. new_type = params->token_embedding_type;
  14811. }
  14812. if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
  14813. new_type = params->output_tensor_type;
  14814. }
  14815. // If we've decided to quantize to the same type the tensor is already
  14816. // in then there's nothing to do.
  14817. quantize = tensor->type != new_type;
  14818. }
  14819. if (!quantize) {
  14820. new_type = tensor->type;
  14821. new_data = tensor->data;
  14822. new_size = ggml_nbytes(tensor);
14823. LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
  14824. } else {
  14825. const int64_t nelements = ggml_nelements(tensor);
  14826. const float * imatrix = nullptr;
  14827. if (imatrix_data) {
  14828. auto it = imatrix_data->find(tensor->name);
  14829. if (it == imatrix_data->end()) {
  14830. LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
  14831. } else {
  14832. if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
  14833. imatrix = it->second.data();
  14834. } else {
  14835. LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
  14836. int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
14837. // this can happen when quantizing an old mixtral model with split tensors using a new, incompatible imatrix
14838. // this is a significant error and it may be a good idea to abort the process if this happens,
  14839. // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
  14840. // tok_embd should be ignored in this case, since it always causes this warning
  14841. if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
  14842. throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
  14843. int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
  14844. }
  14845. }
  14846. }
  14847. }
  14848. if ((new_type == GGML_TYPE_IQ2_XXS ||
  14849. new_type == GGML_TYPE_IQ2_XS ||
  14850. new_type == GGML_TYPE_IQ2_S ||
  14851. new_type == GGML_TYPE_IQ1_S ||
  14852. (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
  14853. (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
  14854. LLAMA_LOG_ERROR("\n\n============================================================\n");
  14855. LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
  14856. LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
  14857. LLAMA_LOG_ERROR("============================================================\n\n");
  14858. throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
  14859. }
  14860. float * f32_data;
  14861. if (tensor->type == GGML_TYPE_F32) {
  14862. f32_data = (float *) tensor->data;
  14863. } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
  14864. throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
  14865. } else {
  14866. llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
  14867. f32_data = (float *) f32_conv_buf.data();
  14868. }
  14869. int chunk_size_multiplier = 1;
  14870. if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
  14871. if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
  14872. else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
  14873. if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
  14874. else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
  14875. }
  14876. LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
  14877. fflush(stdout);
  14878. if (work.size() < (size_t)nelements * 4) {
  14879. work.resize(nelements * 4); // upper bound on size
  14880. }
  14881. new_data = work.data();
  14882. const int64_t n_per_row = tensor->ne[0];
  14883. const int64_t nrows = tensor->ne[1];
  14884. static const int64_t min_chunk_size = 32 * 512;
  14885. const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
  14886. chunk_size_multiplier;
  14887. const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
  14888. const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
  14889. const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
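// worked example (illustrative): a 4096 x 4096 tensor has n_per_row = 4096 < min_chunk_size,
// so chunk_size = 4096 * ceil(16384/4096) = 16384 elements, i.e. 4 rows per chunk
// (chunk_size_multiplier is 1 for regular types); with nelements_matrix = 16777216 this
// yields nchunk = 1024, so up to 1024 threads could be kept busy and nthread_use is
// simply capped by the requested nthread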
  14890. // quantize each expert separately since they have different importance matrices
  14891. new_size = 0;
  14892. for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
  14893. const float * f32_data_03 = f32_data + i03 * nelements_matrix;
  14894. void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
  14895. const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
  14896. new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
  14897. }
  14898. LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
  14899. }
  14900. total_size_org += ggml_nbytes(tensor);
  14901. total_size_new += new_size;
  14902. // update the gguf meta data as we go
  14903. gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
  14904. gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
  14905. // write tensor data + padding
  14906. fout.write((const char *) new_data, new_size);
  14907. zeros(fout, GGML_PAD(new_size, align) - new_size);
  14908. }
  14909. close_ofstream();
14910. for (auto & c : ctx_outs) {
  14911. gguf_free(c);
  14912. }
14913. LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
14914. LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
  14915. if (qs.n_fallback > 0) {
  14916. LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
  14917. __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
  14918. }
  14919. }
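// For illustration only - a minimal caller of this routine through the public API
// (assumes the llama_model_quantize()/llama_model_quantize_default_params() declarations
// in llama.h); kept as a comment:
//
//     llama_model_quantize_params qparams = llama_model_quantize_default_params();
//     qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;
//     qparams.nthread = 8;
//     if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams) != 0) {
//         // non-zero return value indicates failure
//     }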
  14920. static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
  14921. LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
  14922. ggml_context * ctx = nullptr;
  14923. struct gguf_init_params meta_gguf_params = {
  14924. /* .no_alloc = */ true,
  14925. /* .ctx = */ &ctx,
  14926. };
  14927. struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
  14928. if (!ctx_gguf) {
  14929. throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
  14930. }
  14931. // check metadata
  14932. {
  14933. auto get_kv_str = [&](const std::string & key) -> std::string {
  14934. int id = gguf_find_key(ctx_gguf, key.c_str());
  14935. return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
  14936. };
  14937. auto get_kv_f32 = [&](const std::string & key) -> float {
  14938. int id = gguf_find_key(ctx_gguf, key.c_str());
  14939. return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
  14940. };
  14941. LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
  14942. auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
  14943. if (general_type != "adapter") {
  14944. gguf_free(ctx_gguf);
  14945. throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
  14946. }
  14947. auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
  14948. auto general_arch = llm_arch_from_string(general_arch_str);
  14949. if (general_arch != model->arch) {
  14950. gguf_free(ctx_gguf);
  14951. throw std::runtime_error("model arch and LoRA arch mismatch");
  14952. }
  14953. auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
  14954. if (adapter_type != "lora") {
  14955. gguf_free(ctx_gguf);
  14956. throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
  14957. }
  14958. adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
  14959. }
  14960. int n_tensors = gguf_get_n_tensors(ctx_gguf);
  14961. // contexts for each buffer type
  14962. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  14963. auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
  14964. auto it = ctx_map.find(buft);
  14965. if (it == ctx_map.end()) {
  14966. // add a new context
  14967. struct ggml_init_params params = {
  14968. /*.mem_size =*/ n_tensors*ggml_tensor_overhead(),
  14969. /*.mem_buffer =*/ NULL,
  14970. /*.no_alloc =*/ true,
  14971. };
  14972. ggml_context * buft_ctx = ggml_init(params);
  14973. ctx_map[buft] = buft_ctx;
  14974. return buft_ctx;
  14975. };
  14976. return it->second;
  14977. };
  14978. // bundle lora_a and lora_b into pairs
  14979. std::map<std::string, llama_lora_weight> ab_map;
  14980. auto str_endswith = [](const std::string & str, const std::string & suffix) {
  14981. return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
  14982. };
  14983. for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
  14984. std::string name(cur->name);
  14985. if (str_endswith(name, ".lora_a")) {
  14986. replace_all(name, ".lora_a", "");
  14987. if (ab_map.find(name) == ab_map.end()) {
  14988. ab_map[name] = llama_lora_weight(cur, nullptr);
  14989. } else {
  14990. ab_map[name].a = cur;
  14991. }
  14992. } else if (str_endswith(name, ".lora_b")) {
  14993. replace_all(name, ".lora_b", "");
  14994. if (ab_map.find(name) == ab_map.end()) {
  14995. ab_map[name] = llama_lora_weight(nullptr, cur);
  14996. } else {
  14997. ab_map[name].b = cur;
  14998. }
  14999. } else {
  15000. gguf_free(ctx_gguf);
  15001. ggml_free(ctx);
  15002. throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
  15003. }
  15004. }
  15005. // add tensors
  15006. for (auto & it : ab_map) {
  15007. const std::string & name = it.first;
  15008. llama_lora_weight & w = it.second;
  15009. if (!w.a || !w.b) {
  15010. gguf_free(ctx_gguf);
  15011. ggml_free(ctx);
  15012. throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
  15013. }
  15014. // device buft and device ctx
  15015. auto * model_tensor = llama_get_model_tensor(model, name.c_str());
  15016. if (!model_tensor) {
  15017. gguf_free(ctx_gguf);
  15018. ggml_free(ctx);
  15019. throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
  15020. }
  15021. struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
  15022. // validate tensor shape
  15023. if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
  15024. gguf_free(ctx_gguf);
  15025. ggml_free(ctx);
  15026. throw std::runtime_error("tensor '" + name + "' has incorrect shape");
  15027. }
  15028. if (w.a->ne[1] != w.b->ne[0]) {
  15029. gguf_free(ctx_gguf);
  15030. ggml_free(ctx);
  15031. throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
  15032. }
  15033. // save tensor to adapter
  15034. struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
  15035. struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
  15036. ggml_set_name(tensor_a, w.a->name);
  15037. ggml_set_name(tensor_b, w.b->name);
  15038. adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
  15039. }
  15040. // allocate tensors / buffers and zero
  15041. {
  15042. adapter.ctxs.reserve(ctx_map.size());
  15043. adapter.bufs.reserve(ctx_map.size());
  15044. for (auto it : ctx_map) {
  15045. ggml_backend_buffer_type_t buft = it.first;
  15046. ggml_context * ctx_dev = it.second;
  15047. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
  15048. if (!buf) {
  15049. gguf_free(ctx_gguf);
  15050. ggml_free(ctx);
15051. throw std::runtime_error("failed to allocate buffer for lora adapter");
  15052. }
  15053. LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
  15054. adapter.ctxs.push_back(ctx_dev);
  15055. adapter.bufs.push_back(buf);
  15056. }
  15057. }
  15058. // set tensor data
  15059. {
  15060. llama_file gguf_file(path_lora, "rb");
  15061. std::vector<uint8_t> read_buf;
  15062. auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
  15063. size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
  15064. size_t size = ggml_nbytes(orig);
  15065. read_buf.resize(size);
  15066. gguf_file.seek(offs, SEEK_SET);
  15067. gguf_file.read_raw(read_buf.data(), size);
  15068. ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
  15069. };
  15070. for (auto & it : adapter.ab_map) {
  15071. auto orig = ab_map[it.first];
  15072. auto dev = it.second;
  15073. set_tensor(orig.a, dev.a);
  15074. set_tensor(orig.b, dev.b);
  15075. }
  15076. }
15077. LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
  15078. // free ctx for reading gguf
  15079. gguf_free(ctx_gguf);
  15080. ggml_free(ctx);
  15081. }
  15082. int32_t llama_lora_adapter_set(
  15083. struct llama_context * ctx,
  15084. struct llama_lora_adapter * adapter,
  15085. float scale) {
  15086. if (ctx->cparams.flash_attn) {
  15087. LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
  15088. return -1;
  15089. }
  15090. ctx->lora_adapters[adapter] = scale;
  15091. return 0;
  15092. }
  15093. int32_t llama_lora_adapter_remove(
  15094. struct llama_context * ctx,
  15095. struct llama_lora_adapter * adapter) {
  15096. auto pos = ctx->lora_adapters.find(adapter);
  15097. if (pos != ctx->lora_adapters.end()) {
  15098. ctx->lora_adapters.erase(pos);
  15099. return 0;
  15100. }
  15101. return -1;
  15102. }
  15103. void llama_lora_adapter_clear(struct llama_context * ctx) {
  15104. ctx->lora_adapters.clear();
  15105. }
  15106. void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
  15107. delete adapter;
  15108. }
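// Usage sketch for the LoRA adapter API above; `model` and `ctx` are assumed
// to be a loaded model and a context created from it, and "adapter.gguf" is a
// placeholder path.
//
//     struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf");
//     if (adapter == nullptr) {
//         return; // failed to load, or incompatible with the base model
//     }
//     // attach with scale 1.0 (returns -1 if the context uses flash_attn)
//     if (llama_lora_adapter_set(ctx, adapter, 1.0f) != 0) {
//         llama_lora_adapter_free(adapter);
//         return;
//     }
//     // ... run inference with the adapter active ...
//     llama_lora_adapter_remove(ctx, adapter); // detach from this context only
//     llama_lora_adapter_free(adapter);        // release the adapter tensors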
  15109. //
  15110. // interface implementation
  15111. //
  15112. struct llama_model_params llama_model_default_params() {
  15113. struct llama_model_params result = {
  15114. /*.n_gpu_layers =*/ 0,
  15115. /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  15116. /*.main_gpu =*/ 0,
  15117. /*.tensor_split =*/ nullptr,
  15118. /*.rpc_servers =*/ nullptr,
  15119. /*.progress_callback =*/ nullptr,
  15120. /*.progress_callback_user_data =*/ nullptr,
  15121. /*.kv_overrides =*/ nullptr,
  15122. /*.vocab_only =*/ false,
  15123. /*.use_mmap =*/ true,
  15124. /*.use_mlock =*/ false,
  15125. /*.check_tensors =*/ false,
  15126. };
  15127. #ifdef GGML_USE_METAL
  15128. // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
  15129. result.n_gpu_layers = 999;
  15130. #endif
  15131. return result;
  15132. }
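// Usage sketch: overriding a few of the defaults above before loading a model;
// the values shown are arbitrary and only illustrate the available fields.
//
//     struct llama_model_params mparams = llama_model_default_params();
//     mparams.n_gpu_layers  = 32;    // offload 32 layers (default is 0, or 999 on Metal builds)
//     mparams.use_mlock     = false; // do not pin the model in RAM
//     mparams.check_tensors = true;  // validate tensor data while loading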
  15133. struct llama_context_params llama_context_default_params() {
  15134. struct llama_context_params result = {
  15135. /*.seed =*/ LLAMA_DEFAULT_SEED,
  15136. /*.n_ctx =*/ 512,
  15137. /*.n_batch =*/ 2048,
  15138. /*.n_ubatch =*/ 512,
  15139. /*.n_seq_max =*/ 1,
  15140. /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
  15141. /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
  15142. /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
  15143. /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
  15144. /*.attention_type =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
  15145. /*.rope_freq_base =*/ 0.0f,
  15146. /*.rope_freq_scale =*/ 0.0f,
  15147. /*.yarn_ext_factor =*/ -1.0f,
  15148. /*.yarn_attn_factor =*/ 1.0f,
  15149. /*.yarn_beta_fast =*/ 32.0f,
  15150. /*.yarn_beta_slow =*/ 1.0f,
  15151. /*.yarn_orig_ctx =*/ 0,
  15152. /*.defrag_thold =*/ -1.0f,
  15153. /*.cb_eval =*/ nullptr,
  15154. /*.cb_eval_user_data =*/ nullptr,
  15155. /*.type_k =*/ GGML_TYPE_F16,
  15156. /*.type_v =*/ GGML_TYPE_F16,
  15157. /*.logits_all =*/ false,
  15158. /*.embeddings =*/ false,
  15159. /*.offload_kqv =*/ true,
  15160. /*.flash_attn =*/ false,
  15161. /*.abort_callback =*/ nullptr,
  15162. /*.abort_callback_data =*/ nullptr,
  15163. };
  15164. return result;
  15165. }
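// Usage sketch: adjusting the context defaults above; the values are arbitrary
// and only illustrate which knobs exist.
//
//     struct llama_context_params cparams = llama_context_default_params();
//     cparams.n_ctx     = 4096; // 0 falls back to the model's training context size
//     cparams.n_batch   = 1024; // logical batch size
//     cparams.n_ubatch  = 256;  // physical micro-batch size
//     cparams.n_threads = 8;    // threads used for single-token generation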
  15166. struct llama_model_quantize_params llama_model_quantize_default_params() {
  15167. struct llama_model_quantize_params result = {
  15168. /*.nthread =*/ 0,
  15169. /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
  15170. /*.output_tensor_type =*/ GGML_TYPE_COUNT,
  15171. /*.token_embedding_type =*/ GGML_TYPE_COUNT,
  15172. /*.allow_requantize =*/ false,
  15173. /*.quantize_output_tensor =*/ true,
  15174. /*.only_copy =*/ false,
  15175. /*.pure =*/ false,
  15176. /*.keep_split =*/ false,
  15177. /*.imatrix =*/ nullptr,
  15178. /*.kv_overrides =*/ nullptr,
  15179. };
  15180. return result;
  15181. }
  15182. size_t llama_max_devices(void) {
  15183. #if defined(GGML_USE_RPC)
  15184. return GGML_RPC_MAX_SERVERS;
  15185. #elif defined(GGML_USE_METAL)
  15186. return 1;
  15187. #elif defined(GGML_USE_CUDA)
  15188. return GGML_CUDA_MAX_DEVICES;
  15189. #elif defined(GGML_USE_SYCL)
  15190. return GGML_SYCL_MAX_DEVICES;
  15191. #elif defined(GGML_USE_VULKAN)
  15192. return GGML_VK_MAX_DEVICES;
  15193. #elif defined(GGML_USE_CANN)
  15194. return GGML_CANN_MAX_DEVICES;
  15195. #else
  15196. return 1;
  15197. #endif
  15198. }
  15199. bool llama_supports_mmap(void) {
  15200. return llama_mmap::SUPPORTED;
  15201. }
  15202. bool llama_supports_mlock(void) {
  15203. return llama_mlock::SUPPORTED;
  15204. }
  15205. bool llama_supports_gpu_offload(void) {
  15206. #if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
  15207. defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
  15208. // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  15209. return true;
  15210. #else
  15211. return false;
  15212. #endif
  15213. }
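// Usage sketch: probing the build-time capabilities reported above before
// deciding how to configure model loading.
//
//     if (llama_supports_gpu_offload()) {
//         // safe to request n_gpu_layers > 0; llama_max_devices() bounds the tensor_split array
//     }
//     if (llama_supports_mmap() && !llama_supports_mlock()) {
//         // memory mapping is available, but pinning pages is not on this platform
//     }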
  15214. void llama_backend_init(void) {
  15215. ggml_time_init();
  15216. // needed to initialize f16 tables
  15217. {
  15218. struct ggml_init_params params = { 0, NULL, false };
  15219. struct ggml_context * ctx = ggml_init(params);
  15220. ggml_free(ctx);
  15221. }
  15222. }
  15223. void llama_numa_init(enum ggml_numa_strategy numa) {
  15224. if (numa != GGML_NUMA_STRATEGY_DISABLED) {
  15225. ggml_numa_init(numa);
  15226. }
  15227. }
  15228. void llama_attach_threadpool(
  15229. struct llama_context * ctx,
  15230. ggml_threadpool_t threadpool,
  15231. ggml_threadpool_t threadpool_batch) {
  15232. ctx->threadpool = threadpool;
  15233. ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
  15234. }
  15235. void llama_detach_threadpool(struct llama_context * ctx) {
  15236. ctx->threadpool = nullptr;
  15237. ctx->threadpool_batch = nullptr;
  15238. }
  15239. void llama_backend_free(void) {
  15240. ggml_quantize_free();
  15241. }
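// Usage sketch: the process-wide lifecycle implied by llama_backend_init() and
// llama_backend_free() above; typically done once around all model and context
// usage.
//
//     int main() {
//         llama_backend_init();
//         llama_numa_init(GGML_NUMA_STRATEGY_DISABLED); // or a real NUMA strategy
//         // ... load models, create contexts, run inference ...
//         llama_backend_free();
//         return 0;
//     }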
  15242. int64_t llama_time_us(void) {
  15243. return ggml_time_us();
  15244. }
  15245. struct llama_model * llama_load_model_from_file(
  15246. const char * path_model,
  15247. struct llama_model_params params) {
  15248. ggml_time_init();
  15249. llama_model * model = new llama_model;
  15250. unsigned cur_percentage = 0;
  15251. if (params.progress_callback == NULL) {
  15252. params.progress_callback_user_data = &cur_percentage;
  15253. params.progress_callback = [](float progress, void * ctx) {
  15254. unsigned * cur_percentage_p = (unsigned *) ctx;
  15255. unsigned percentage = (unsigned) (100 * progress);
  15256. while (percentage > *cur_percentage_p) {
  15257. *cur_percentage_p = percentage;
  15258. LLAMA_LOG_INFO(".");
  15259. if (percentage >= 100) {
  15260. LLAMA_LOG_INFO("\n");
  15261. }
  15262. }
  15263. return true;
  15264. };
  15265. }
  15266. if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
15267. // split the comma-separated server list and store the entries in model->rpc_servers
  15268. std::string servers(params.rpc_servers);
  15269. size_t pos = 0;
  15270. while ((pos = servers.find(",")) != std::string::npos) {
  15271. std::string server = servers.substr(0, pos);
  15272. model->rpc_servers.push_back(server);
  15273. servers.erase(0, pos + 1);
  15274. }
  15275. model->rpc_servers.push_back(servers);
  15276. }
  15277. int status = llama_model_load(path_model, *model, params);
  15278. GGML_ASSERT(status <= 0);
  15279. if (status < 0) {
  15280. if (status == -1) {
  15281. LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
  15282. } else if (status == -2) {
  15283. LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
  15284. }
  15285. delete model;
  15286. return nullptr;
  15287. }
  15288. return model;
  15289. }
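// Usage sketch: loading a model with a custom progress callback; "model.gguf"
// is a placeholder path, and returning false from the callback is assumed to
// cancel the load (reported as status -2 above).
//
//     struct llama_model_params mparams = llama_model_default_params();
//     mparams.progress_callback = [](float progress, void * user_data) -> bool {
//         (void) user_data;
//         fprintf(stderr, "\rloading: %3d%%", (int) (100 * progress));
//         return true; // false would cancel the load
//     };
//     struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
//     if (model == nullptr) {
//         // load failed or was cancelled
//     }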
  15290. void llama_free_model(struct llama_model * model) {
  15291. delete model;
  15292. }
  15293. struct llama_context * llama_new_context_with_model(
  15294. struct llama_model * model,
  15295. struct llama_context_params params) {
  15296. if (!model) {
  15297. LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
  15298. return nullptr;
  15299. }
  15300. if (params.n_batch == 0 && params.n_ubatch == 0) {
  15301. LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
  15302. return nullptr;
  15303. }
  15304. if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
  15305. LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
  15306. return nullptr;
  15307. }
  15308. if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
  15309. LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
  15310. params.flash_attn = false;
  15311. }
  15312. if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
  15313. LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
  15314. params.flash_attn = false;
  15315. }
  15316. if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
  15317. LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
  15318. return nullptr;
  15319. }
  15320. llama_context * ctx = new llama_context(*model);
  15321. const auto & hparams = model->hparams;
  15322. auto & cparams = ctx->cparams;
  15323. cparams.n_seq_max = std::max(1u, params.n_seq_max);
  15324. cparams.n_threads = params.n_threads;
  15325. cparams.n_threads_batch = params.n_threads_batch;
  15326. cparams.yarn_ext_factor = params.yarn_ext_factor;
  15327. cparams.yarn_attn_factor = params.yarn_attn_factor;
  15328. cparams.yarn_beta_fast = params.yarn_beta_fast;
  15329. cparams.yarn_beta_slow = params.yarn_beta_slow;
  15330. cparams.defrag_thold = params.defrag_thold;
  15331. cparams.embeddings = params.embeddings;
  15332. cparams.offload_kqv = params.offload_kqv;
  15333. cparams.flash_attn = params.flash_attn;
  15334. cparams.pooling_type = params.pooling_type;
  15335. cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
  15336. cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
  15337. cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
  15338. // this is necessary due to kv_self.n being padded later during inference
  15339. cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
  15340. // with causal attention, the batch size is limited by the context size
  15341. cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
  15342. // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
  15343. // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
  15344. // ref: https://github.com/ggerganov/llama.cpp/pull/5021
  15345. if (cparams.n_batch < GGML_KQ_MASK_PAD) {
  15346. LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
  15347. cparams.n_batch = GGML_KQ_MASK_PAD;
  15348. }
  15349. cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
  15350. cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
  15351. hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
  15352. hparams.n_ctx_train;
  15353. cparams.cb_eval = params.cb_eval;
  15354. cparams.cb_eval_user_data = params.cb_eval_user_data;
  15355. auto rope_scaling_type = params.rope_scaling_type;
  15356. if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
  15357. rope_scaling_type = hparams.rope_scaling_type_train;
  15358. }
  15359. if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
  15360. cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
  15361. }
  15362. if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
  15363. cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
  15364. }
  15365. cparams.yarn_attn_factor *= hparams.rope_attn_factor;
  15366. if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
  15367. if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
  15368. cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
  15369. } else {
  15370. cparams.pooling_type = hparams.pooling_type;
  15371. }
  15372. }
  15373. if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
  15374. cparams.causal_attn = hparams.causal_attn;
  15375. } else {
  15376. cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
  15377. }
  15378. if (params.seed == LLAMA_DEFAULT_SEED) {
  15379. params.seed = time(NULL);
  15380. }
  15381. LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
  15382. LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
  15383. LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
  15384. LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
  15385. LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
  15386. LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
  15387. ctx->abort_callback = params.abort_callback;
  15388. ctx->abort_callback_data = params.abort_callback_data;
  15389. ctx->sampling.rng = std::mt19937(params.seed);
  15390. ctx->logits_all = params.logits_all;
  15391. // build worst-case graph for encoder if a model contains encoder
  15392. ctx->is_encoding = llama_model_has_encoder(model);
  15393. uint32_t kv_size = cparams.n_ctx;
  15394. ggml_type type_k = params.type_k;
  15395. ggml_type type_v = params.type_v;
  15396. // Mamba only needs a constant number of KV cache cells per sequence
  15397. if (llama_model_is_recurrent(model)) {
  15398. // Mamba needs at least as many KV cells as there are sequences kept at any time
  15399. kv_size = std::max((uint32_t) 1, params.n_seq_max);
  15400. // it's probably best to keep as much precision as possible for the states
  15401. type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
  15402. type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
  15403. }
  15404. GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
  15405. GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
  15406. if (!hparams.vocab_only) {
  15407. // initialize backends
  15408. #if defined(GGML_USE_METAL)
  15409. if (model->n_gpu_layers > 0) {
  15410. ctx->backend_metal = ggml_backend_metal_init();
  15411. if (ctx->backend_metal == nullptr) {
  15412. LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
  15413. llama_free(ctx);
  15414. return nullptr;
  15415. }
  15416. ctx->backends.push_back(ctx->backend_metal);
  15417. }
  15418. #elif defined(GGML_USE_CUDA)
  15419. if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
  15420. // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
  15421. ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
  15422. if (backend == nullptr) {
  15423. LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
  15424. llama_free(ctx);
  15425. return nullptr;
  15426. }
  15427. ctx->backends.push_back(backend);
  15428. } else {
  15429. // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
  15430. for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
  15431. ggml_backend_t backend = ggml_backend_cuda_init(device);
  15432. if (backend == nullptr) {
  15433. LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
  15434. llama_free(ctx);
  15435. return nullptr;
  15436. }
  15437. ctx->backends.push_back(backend);
  15438. }
  15439. }
  15440. #elif defined(GGML_USE_VULKAN)
  15441. if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
  15442. LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
  15443. llama_free(ctx);
  15444. return nullptr;
  15445. }
  15446. if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
  15447. ggml_backend_t backend = ggml_backend_vk_init(model->main_gpu);
  15448. if (backend == nullptr) {
  15449. LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
  15450. llama_free(ctx);
  15451. return nullptr;
  15452. }
  15453. ctx->backends.push_back(backend);
  15454. } else {
  15455. for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
  15456. ggml_backend_t backend = ggml_backend_vk_init(device);
  15457. if (backend == nullptr) {
  15458. LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
  15459. llama_free(ctx);
  15460. return nullptr;
  15461. }
  15462. ctx->backends.push_back(backend);
  15463. }
  15464. }
  15465. #elif defined(GGML_USE_SYCL)
  15466. // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
  15467. if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
  15468. ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
  15469. if (backend == nullptr) {
  15470. LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
  15471. llama_free(ctx);
  15472. return nullptr;
  15473. }
  15474. ctx->backends.push_back(backend);
  15475. } else {
15476. // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
  15477. for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
  15478. ggml_backend_t backend = ggml_backend_sycl_init(i);
  15479. if (backend == nullptr) {
15480. LLAMA_LOG_ERROR("%s: failed to initialize SYCL backend for device %d\n", __func__, i);
  15481. llama_free(ctx);
  15482. return nullptr;
  15483. }
  15484. ctx->backends.push_back(backend);
  15485. }
  15486. }
  15487. #elif defined(GGML_USE_KOMPUTE)
  15488. if (model->n_gpu_layers > 0) {
  15489. auto * backend = ggml_backend_kompute_init(model->main_gpu);
  15490. if (backend == nullptr) {
  15491. LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
  15492. llama_free(ctx);
  15493. return nullptr;
  15494. }
  15495. ctx->backends.push_back(backend);
  15496. }
  15497. #elif defined(GGML_USE_CANN)
  15498. // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
15499. // TODO: ggml_backend_cann does not support split tensors yet; the code is kept here for future use.
  15500. if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
  15501. ggml_backend_t backend = ggml_backend_cann_init(model->main_gpu);
  15502. if (backend == nullptr) {
  15503. LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, model->main_gpu);
  15504. llama_free(ctx);
  15505. return nullptr;
  15506. }
  15507. ctx->backends.push_back(backend);
  15508. } else {
  15509. // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
15510. // TODO: CANN cannot currently use multiple GPUs; the code is kept here for future CANN versions.
  15511. for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
  15512. ggml_backend_t backend = ggml_backend_cann_init(device);
  15513. if (backend == nullptr) {
  15514. LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
  15515. llama_free(ctx);
  15516. return nullptr;
  15517. }
  15518. ctx->backends.push_back(backend);
  15519. }
  15520. }
  15521. #endif
  15522. #ifdef GGML_USE_BLAS
  15523. ctx->backend_blas = ggml_backend_blas_init();
  15524. if (ctx->backend_blas == nullptr) {
  15525. LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
  15526. } else {
  15527. ctx->backends.push_back(ctx->backend_blas);
  15528. }
  15529. #endif
  15530. #if defined(GGML_USE_RPC)
  15531. if (model->n_gpu_layers > 0) {
  15532. for (const auto & endpoint : model->rpc_servers) {
  15533. ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
  15534. if (backend == nullptr) {
  15535. LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
  15536. llama_free(ctx);
  15537. return nullptr;
  15538. }
  15539. ctx->backends.push_back(backend);
  15540. }
  15541. }
  15542. #endif
  15543. ctx->backend_cpu = ggml_backend_cpu_init();
  15544. if (ctx->backend_cpu == nullptr) {
  15545. LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
  15546. llama_free(ctx);
  15547. return nullptr;
  15548. }
  15549. ctx->backends.push_back(ctx->backend_cpu);
  15550. if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
  15551. LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
  15552. llama_free(ctx);
  15553. return nullptr;
  15554. }
  15555. {
  15556. size_t memory_size_k = 0;
  15557. size_t memory_size_v = 0;
  15558. for (auto & k : ctx->kv_self.k_l) {
  15559. memory_size_k += ggml_nbytes(k);
  15560. }
  15561. for (auto & v : ctx->kv_self.v_l) {
  15562. memory_size_v += ggml_nbytes(v);
  15563. }
  15564. LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
  15565. (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
  15566. ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
  15567. ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
  15568. }
  15569. // graph outputs buffer
  15570. {
  15571. // resized during inference when a batch uses more outputs
  15572. if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
  15573. LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
  15574. llama_free(ctx);
  15575. return nullptr;
  15576. }
  15577. LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
  15578. ggml_backend_buffer_name(ctx->buf_output),
  15579. ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
  15580. }
  15581. // scheduler and compute buffers
  15582. {
  15583. // buffer types used for the compute buffer of each backend
  15584. std::vector<ggml_backend_buffer_type_t> backend_buft;
  15585. for (auto * backend : ctx->backends) {
  15586. if (ggml_backend_is_cpu(backend)) {
  15587. // use host buffers for the CPU backend compute buffer
  15588. backend_buft.push_back(llama_default_buffer_type_cpu(true));
  15589. } else {
  15590. backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
  15591. }
  15592. }
  15593. const size_t max_nodes = llama_model_max_nodes(*model);
  15594. // buffer used to store the computation graph and the tensor meta data
  15595. ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
  15596. // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
  15597. bool pipeline_parallel =
  15598. llama_get_device_count(*model) > 1 &&
  15599. model->n_gpu_layers > (int)model->hparams.n_layer &&
  15600. model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
  15601. params.offload_kqv;
  15602. #ifndef GGML_USE_CUDA
  15603. // pipeline parallelism requires support for async compute and events
  15604. // currently this is only implemented in the CUDA backend
  15605. pipeline_parallel = false;
  15606. #endif
  15607. ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
  15608. if (pipeline_parallel) {
  15609. LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
  15610. }
  15611. // build worst-case graph
  15612. uint32_t n_seqs = 1; // TODO: worst-case number of sequences
  15613. uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
  15614. llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
  15615. llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  15616. ggml_cgraph * gf = llama_build_graph(*ctx, ubatch, true);
  15617. // initialize scheduler with the worst-case graph
  15618. if (!ggml_backend_sched_reserve(ctx->sched, gf)) {
  15619. LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
  15620. llama_free(ctx);
  15621. return nullptr;
  15622. }
  15623. for (size_t i = 0; i < ctx->backends.size(); i++) {
  15624. ggml_backend_t backend = ctx->backends[i];
  15625. ggml_backend_buffer_type_t buft = backend_buft[i];
  15626. size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
  15627. if (size > 1) {
  15628. LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
  15629. ggml_backend_buft_name(buft),
  15630. size / 1024.0 / 1024.0);
  15631. }
  15632. }
  15633. // note: the number of splits during measure is higher than during inference due to the kv shift
  15634. int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
  15635. LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
  15636. LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
  15637. }
  15638. }
  15639. return ctx;
  15640. }
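// Usage sketch: the typical call sequence for llama_new_context_with_model();
// `model` is assumed to come from llama_load_model_from_file(). The function
// can return nullptr (bad parameters, backend or KV cache initialization
// failure), so the result must be checked.
//
//     struct llama_context_params cparams = llama_context_default_params();
//     cparams.n_ctx      = 0;     // 0 -> use the model's training context size
//     cparams.flash_attn = false; // must be true if type_v is a quantized type
//     struct llama_context * lctx = llama_new_context_with_model(model, cparams);
//     if (lctx == nullptr) {
//         // the log output above explains the failure
//     }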
  15641. void llama_free(struct llama_context * ctx) {
  15642. delete ctx;
  15643. }
  15644. const struct llama_model * llama_get_model(const struct llama_context * ctx) {
  15645. return &ctx->model;
  15646. }
  15647. const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
  15648. return &ctx->model.vocab;
  15649. }
  15650. uint32_t llama_n_ctx(const struct llama_context * ctx) {
  15651. return ctx->cparams.n_ctx;
  15652. }
  15653. uint32_t llama_n_batch(const struct llama_context * ctx) {
  15654. return ctx->cparams.n_batch;
  15655. }
  15656. uint32_t llama_n_ubatch(const struct llama_context * ctx) {
  15657. return ctx->cparams.n_ubatch;
  15658. }
  15659. uint32_t llama_n_seq_max(const struct llama_context * ctx) {
  15660. return ctx->kv_self.size;
  15661. }
  15662. enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
  15663. return model->vocab.type;
  15664. }
  15665. enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  15666. switch (model->arch) {
  15667. // these models do not use RoPE
  15668. case LLM_ARCH_GPT2:
  15669. case LLM_ARCH_GPTJ:
  15670. case LLM_ARCH_MPT:
  15671. case LLM_ARCH_REFACT:
  15672. case LLM_ARCH_BLOOM:
  15673. case LLM_ARCH_MAMBA:
  15674. case LLM_ARCH_JINA_BERT_V2:
  15675. case LLM_ARCH_T5:
  15676. case LLM_ARCH_T5ENCODER:
  15677. case LLM_ARCH_JAIS:
  15678. case LLM_ARCH_RWKV6:
  15679. return LLAMA_ROPE_TYPE_NONE;
  15680. // use what we call a normal RoPE, operating on pairs of consecutive head values
  15681. case LLM_ARCH_LLAMA:
  15682. case LLM_ARCH_BAICHUAN:
  15683. case LLM_ARCH_STARCODER:
  15684. case LLM_ARCH_PLAMO:
  15685. case LLM_ARCH_ORION:
  15686. case LLM_ARCH_INTERNLM2:
  15687. case LLM_ARCH_MINICPM:
  15688. case LLM_ARCH_XVERSE:
  15689. case LLM_ARCH_COMMAND_R:
  15690. case LLM_ARCH_OLMO:
  15691. case LLM_ARCH_ARCTIC:
  15692. case LLM_ARCH_DEEPSEEK2:
  15693. case LLM_ARCH_CHATGLM:
  15694. case LLM_ARCH_SOLAR:
  15695. return LLAMA_ROPE_TYPE_NORM;
  15696. // the pairs of head values are offset by n_rot/2
  15697. case LLM_ARCH_FALCON:
  15698. case LLM_ARCH_GROK:
  15699. case LLM_ARCH_DBRX:
  15700. case LLM_ARCH_BERT:
  15701. case LLM_ARCH_NOMIC_BERT:
  15702. case LLM_ARCH_STABLELM:
  15703. case LLM_ARCH_BITNET:
  15704. case LLM_ARCH_QWEN:
  15705. case LLM_ARCH_QWEN2:
  15706. case LLM_ARCH_QWEN2MOE:
  15707. case LLM_ARCH_PHI2:
  15708. case LLM_ARCH_PHI3:
  15709. case LLM_ARCH_GEMMA:
  15710. case LLM_ARCH_GEMMA2:
  15711. case LLM_ARCH_STARCODER2:
  15712. case LLM_ARCH_OPENELM:
  15713. case LLM_ARCH_GPTNEOX:
  15714. case LLM_ARCH_CODESHELL:
  15715. case LLM_ARCH_NEMOTRON:
  15716. case LLM_ARCH_EXAONE:
  15717. return LLAMA_ROPE_TYPE_NEOX;
  15718. // all model arches should be listed explicitly here
  15719. case LLM_ARCH_UNKNOWN:
  15720. GGML_ABORT("unknown architecture");
  15721. }
  15722. return LLAMA_ROPE_TYPE_NONE;
  15723. }
  15724. enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
  15725. return ctx->cparams.pooling_type;
  15726. }
  15727. int32_t llama_n_vocab(const struct llama_model * model) {
  15728. return model->hparams.n_vocab;
  15729. }
  15730. int32_t llama_n_ctx_train(const struct llama_model * model) {
  15731. return model->hparams.n_ctx_train;
  15732. }
  15733. int32_t llama_n_embd(const struct llama_model * model) {
  15734. return model->hparams.n_embd;
  15735. }
  15736. int32_t llama_n_layer(const struct llama_model * model) {
  15737. return model->hparams.n_layer;
  15738. }
  15739. float llama_rope_freq_scale_train(const struct llama_model * model) {
  15740. return model->hparams.rope_freq_scale_train;
  15741. }
  15742. int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
  15743. const auto & it = model->gguf_kv.find(key);
  15744. if (it == model->gguf_kv.end()) {
  15745. if (buf_size > 0) {
  15746. buf[0] = '\0';
  15747. }
  15748. return -1;
  15749. }
  15750. return snprintf(buf, buf_size, "%s", it->second.c_str());
  15751. }
  15752. int32_t llama_model_meta_count(const struct llama_model * model) {
  15753. return (int)model->gguf_kv.size();
  15754. }
  15755. int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
  15756. if (i < 0 || i >= (int)model->gguf_kv.size()) {
  15757. if (buf_size > 0) {
  15758. buf[0] = '\0';
  15759. }
  15760. return -1;
  15761. }
  15762. auto it = model->gguf_kv.begin();
  15763. std::advance(it, i);
  15764. return snprintf(buf, buf_size, "%s", it->first.c_str());
  15765. }
  15766. int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
  15767. if (i < 0 || i >= (int)model->gguf_kv.size()) {
  15768. if (buf_size > 0) {
  15769. buf[0] = '\0';
  15770. }
  15771. return -1;
  15772. }
  15773. auto it = model->gguf_kv.begin();
  15774. std::advance(it, i);
  15775. return snprintf(buf, buf_size, "%s", it->second.c_str());
  15776. }
  15777. int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  15778. return snprintf(buf, buf_size, "%s %s %s",
  15779. llama_model_arch_name(model->arch),
  15780. llama_model_type_name(model->type),
  15781. llama_model_ftype_name(model->ftype).c_str());
  15782. }
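// Usage sketch: iterating the GGUF metadata exposed by the accessors above;
// buffer sizes are arbitrary and values longer than the buffer are truncated
// by snprintf.
//
//     char key[256];
//     char val[256];
//     const int32_t n_meta = llama_model_meta_count(model);
//     for (int32_t i = 0; i < n_meta; ++i) {
//         if (llama_model_meta_key_by_index    (model, i, key, sizeof(key)) >= 0 &&
//             llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) >= 0) {
//             printf("%s = %s\n", key, val);
//         }
//     }
//     char desc[128];
//     llama_model_desc(model, desc, sizeof(desc)); // arch, size and ftype as formatted above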
  15783. uint64_t llama_model_size(const struct llama_model * model) {
  15784. uint64_t size = 0;
  15785. for (const auto & it : model->tensors_by_name) {
  15786. size += ggml_nbytes(it.second);
  15787. }
  15788. return size;
  15789. }
  15790. uint64_t llama_model_n_params(const struct llama_model * model) {
  15791. uint64_t nparams = 0;
  15792. for (const auto & it : model->tensors_by_name) {
  15793. nparams += ggml_nelements(it.second);
  15794. }
  15795. return nparams;
  15796. }
  15797. struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
  15798. auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
  15799. [name](const std::pair<std::string, struct ggml_tensor *> & it) {
  15800. return it.first == name;
  15801. });
  15802. if (it == model->tensors_by_name.end()) {
  15803. return nullptr;
  15804. }
  15805. return it->second;
  15806. }
  15807. bool llama_model_has_encoder(const struct llama_model * model) {
  15808. switch (model->arch) {
  15809. case LLM_ARCH_T5: return true;
  15810. case LLM_ARCH_T5ENCODER: return true;
  15811. default: return false;
  15812. }
  15813. }
  15814. bool llama_model_has_decoder(const struct llama_model * model) {
  15815. switch (model->arch) {
  15816. case LLM_ARCH_T5ENCODER: return false;
  15817. default: return true;
  15818. }
  15819. }
  15820. llama_token llama_model_decoder_start_token(const struct llama_model * model) {
  15821. return model->hparams.dec_start_token_id;
  15822. }
  15823. bool llama_model_is_recurrent(const struct llama_model * model) {
  15824. switch (model->arch) {
  15825. case LLM_ARCH_MAMBA: return true;
  15826. case LLM_ARCH_RWKV6: return true;
  15827. default: return false;
  15828. }
  15829. }
  15830. uint32_t llama_model_quantize(
  15831. const char * fname_inp,
  15832. const char * fname_out,
  15833. const llama_model_quantize_params * params) {
  15834. try {
  15835. llama_model_quantize_internal(fname_inp, fname_out, params);
  15836. return 0;
  15837. } catch (const std::exception & err) {
  15838. LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
  15839. return 1;
  15840. }
  15841. }
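// Usage sketch: quantizing a GGUF file through the wrapper above; the input
// and output paths are placeholders and the default ftype (Q5_1) is kept.
//
//     llama_model_quantize_params qparams = llama_model_quantize_default_params();
//     qparams.nthread = 8; // 0 lets the implementation pick a thread count
//     if (llama_model_quantize("model-f16.gguf", "model-q5_1.gguf", &qparams) != 0) {
//         // quantization failed; the reason is logged above
//     }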
  15842. struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
  15843. try {
  15844. struct llama_lora_adapter * adapter = new llama_lora_adapter(model);
  15845. llama_lora_adapter_init_internal(model, path_lora, *adapter);
  15846. return adapter;
  15847. } catch (const std::exception & err) {
  15848. LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
  15849. return nullptr;
  15850. }
  15851. }
  15852. static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
  15853. GGML_ASSERT(cvec.tensors.empty());
  15854. GGML_ASSERT(cvec.ctxs.empty());
  15855. GGML_ASSERT(cvec.bufs.empty());
  15856. // count layer buffer types
  15857. std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
  15858. for (int64_t i = 0; i < model.hparams.n_layer; i++) {
  15859. buft_layer_count[model.buft_layer[i].buft]++;
  15860. }
  15861. // allocate contexts
  15862. std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
  15863. for (auto & it : buft_layer_count) {
  15864. int n_layers = it.second;
  15865. struct ggml_init_params params = {
  15866. /*.mem_size =*/ n_layers * ggml_tensor_overhead(),
  15867. /*.mem_buffer =*/ NULL,
  15868. /*.no_alloc =*/ true,
  15869. };
  15870. ggml_context * ctx = ggml_init(params);
  15871. if (!ctx) {
  15872. LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
15873. return false;
  15874. }
  15875. ctx_map[it.first] = ctx;
  15876. }
  15877. // make tensors
  15878. cvec.tensors.reserve(model.hparams.n_layer);
  15879. cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
  15880. for (size_t il = 1; il < model.hparams.n_layer; il++) {
  15881. struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
  15882. ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
  15883. cvec.tensors.push_back(tensor);
  15884. }
  15885. // allocate tensors / buffers and zero
  15886. cvec.ctxs.reserve(ctx_map.size());
  15887. cvec.bufs.reserve(ctx_map.size());
  15888. for (auto it : ctx_map) {
  15889. ggml_backend_buffer_type_t buft = it.first;
  15890. ggml_context * ctx = it.second;
  15891. ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
  15892. if (!buf) {
  15893. LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
  15894. return false;
  15895. }
  15896. ggml_backend_buffer_clear(buf, 0);
  15897. cvec.ctxs.push_back(ctx);
  15898. cvec.bufs.push_back(buf);
  15899. }
  15900. return true;
  15901. }
  15902. int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
  15903. const llama_model & model = lctx->model;
  15904. llama_control_vector & cvec = lctx->cvec;
  15905. if (data == nullptr) {
  15906. // disable the current control vector (but leave allocated for later)
  15907. cvec.layer_start = -1;
  15908. cvec.layer_end = -1;
  15909. return 0;
  15910. }
  15911. if (n_embd != (int) model.hparams.n_embd) {
  15912. LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
  15913. return 1;
  15914. }
  15915. if (cvec.tensors.empty()) {
  15916. if (!llama_control_vector_init(cvec, model)) {
  15917. return 1;
  15918. }
  15919. }
  15920. cvec.layer_start = il_start;
  15921. cvec.layer_end = il_end;
  15922. for (size_t il = 1; il < model.hparams.n_layer; il++) {
  15923. assert(cvec.tensors[il] != nullptr);
  15924. const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
  15925. if (off + n_embd <= len) {
  15926. ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
  15927. }
  15928. }
  15929. return 0;
  15930. }
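// Usage sketch: applying a control vector via the function above. The buffer
// layout assumed here is n_embd floats per layer starting at layer 1 (layer 0
// never has a tensor); the zero-filled data is purely illustrative and `lctx`
// and `model` are assumed to exist.
//
//     const int32_t n_embd  = llama_n_embd(model);
//     const int32_t n_layer = llama_n_layer(model);
//     std::vector<float> cvec_data((size_t) n_embd * (n_layer - 1), 0.0f);
//     // apply to layers [1, n_layer); calling again with data == nullptr disables it
//     llama_control_vector_apply(lctx, cvec_data.data(), cvec_data.size(), n_embd, 1, n_layer);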
  15931. struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
  15932. struct llama_kv_cache_view result = {
  15933. /*.n_cells = */ 0,
  15934. /*.n_seq_max = */ n_seq_max,
  15935. /*.token_count = */ 0,
  15936. /*.used_cells = */ llama_get_kv_cache_used_cells(ctx),
  15937. /*.max_contiguous = */ 0,
  15938. /*.max_contiguous_idx = */ -1,
  15939. /*.cells = */ nullptr,
  15940. /*.cells_sequences = */ nullptr,
  15941. };
  15942. return result;
  15943. }
  15944. void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
  15945. if (view->cells != nullptr) {
  15946. free(view->cells);
  15947. view->cells = nullptr;
  15948. }
  15949. if (view->cells_sequences != nullptr) {
  15950. free(view->cells_sequences);
  15951. view->cells_sequences = nullptr;
  15952. }
  15953. }
  15954. void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
  15955. if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
  15956. view->n_cells = int32_t(ctx->kv_self.size);
  15957. void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
  15958. GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
  15959. view->cells = (struct llama_kv_cache_view_cell *)p;
  15960. p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
  15961. GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
  15962. view->cells_sequences = (llama_seq_id *)p;
  15963. }
  15964. const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
  15965. llama_kv_cache_view_cell * c_curr = view->cells;
  15966. llama_seq_id * cs_curr = view->cells_sequences;
  15967. int32_t used_cells = 0;
  15968. int32_t token_count = 0;
  15969. int32_t curr_contig_idx = -1;
  15970. uint32_t max_contig = 0;
  15971. int32_t max_contig_idx = -1;
  15972. for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) {
  15973. const size_t curr_size = kv_cells[i].seq_id.size();
  15974. token_count += curr_size;
  15975. c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
  15976. if (curr_size > 0) {
  15977. if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
  15978. max_contig = i - curr_contig_idx;
  15979. max_contig_idx = curr_contig_idx;
  15980. }
  15981. curr_contig_idx = -1;
  15982. } else if (curr_contig_idx < 0) {
  15983. curr_contig_idx = i;
  15984. }
  15985. int seq_idx = 0;
  15986. for (const llama_seq_id it : kv_cells[i].seq_id) {
  15987. if (seq_idx >= view->n_seq_max) {
  15988. break;
  15989. }
  15990. cs_curr[seq_idx] = it;
  15991. seq_idx++;
  15992. }
  15993. if (seq_idx != 0) {
  15994. used_cells++;
  15995. }
  15996. for (; seq_idx < view->n_seq_max; seq_idx++) {
  15997. cs_curr[seq_idx] = -1;
  15998. }
  15999. }
  16000. if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
  16001. max_contig_idx = curr_contig_idx;
  16002. max_contig = kv_cells.size() - curr_contig_idx;
  16003. }
  16004. view->max_contiguous = max_contig;
  16005. view->max_contiguous_idx = max_contig_idx;
  16006. view->token_count = token_count;
  16007. view->used_cells = used_cells;
  16008. if (uint32_t(used_cells) != ctx->kv_self.used) {
  16009. LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
  16010. __func__, ctx->kv_self.used, used_cells);
  16011. }
  16012. }
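// Usage sketch: inspecting KV cache occupancy with the view API above; the
// view owns heap allocations and must be released with
// llama_kv_cache_view_free().
//
//     struct llama_kv_cache_view view = llama_kv_cache_view_init(lctx, /*n_seq_max=*/ 4);
//     llama_kv_cache_view_update(lctx, &view);
//     printf("cells: %d, used: %d, tokens: %d, max contiguous empty run: %d\n",
//            view.n_cells, view.used_cells, view.token_count, view.max_contiguous);
//     llama_kv_cache_view_free(&view);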
  16013. int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
  16014. int result = 0;
  16015. for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
  16016. result += ctx->kv_self.cells[i].seq_id.size();
  16017. }
  16018. return result;
  16019. }
  16020. int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
  16021. return ctx->kv_self.used;
  16022. }
  16023. void llama_kv_cache_clear(struct llama_context * ctx) {
  16024. llama_kv_cache_clear(ctx->kv_self);
  16025. }
  16026. bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
  16027. return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
  16028. }
  16029. void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
  16030. if (seq_id_src == seq_id_dst) {
  16031. return;
  16032. }
  16033. llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
  16034. }
  16035. void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
  16036. llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
  16037. }
  16038. void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
  16039. if (delta == 0) {
  16040. return;
  16041. }
  16042. llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
  16043. }
  16044. void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
  16045. if (d == 1) {
  16046. return;
  16047. }
  16048. llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
  16049. }
  16050. llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
  16051. return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
  16052. }
  16053. void llama_kv_cache_defrag(struct llama_context * ctx) {
  16054. llama_kv_cache_defrag(ctx->kv_self);
  16055. }
  16056. void llama_kv_cache_update(struct llama_context * ctx) {
  16057. llama_kv_cache_update_internal(*ctx);
  16058. }
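// Usage sketch: common sequence-level cache operations wrapped above; in the
// underlying implementation a negative p0/p1 is treated as an open bound.
//
//     llama_kv_cache_seq_rm (lctx, /*seq_id=*/ 0, /*p0=*/ 32, /*p1=*/ -1); // drop seq 0 from pos 32 onwards
//     llama_kv_cache_seq_cp (lctx, 0, 1, -1, -1);                          // share seq 0's cells with seq 1
//     llama_kv_cache_seq_add(lctx, 1, 0, -1, -8);                          // shift seq 1 back by 8 positions
//     llama_kv_cache_defrag (lctx);                                        // request defragmentation
//     llama_kv_cache_update (lctx);                                        // apply pending shifts / defrag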
  16059. // deprecated
  16060. size_t llama_get_state_size(struct llama_context * ctx) {
  16061. return llama_state_get_size(ctx);
  16062. }
  16063. // deprecated
  16064. size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  16065. return llama_state_get_data(ctx, dst, -1);
  16066. }
  16067. // deprecated
  16068. size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  16069. return llama_state_set_data(ctx, src, -1);
  16070. }
  16071. // deprecated
  16072. bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  16073. return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
  16074. }
  16075. // deprecated
  16076. bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  16077. return llama_state_save_file(ctx, path_session, tokens, n_token_count);
  16078. }
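// Usage sketch: saving and restoring full context state with the
// non-deprecated llama_state_* API that the wrappers above forward to,
// assuming the (ctx, buf, size) signatures implied by those wrappers.
//
//     const size_t state_size = llama_state_get_size(lctx);
//     std::vector<uint8_t> state(state_size);
//     llama_state_get_data(lctx, state.data(), state.size());
//     // ... later, in a context created with the same model and parameters ...
//     llama_state_set_data(lctx, state.data(), state.size());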
  16079. // TODO: replace all non-fatal assertions with returned errors or exceptions
  16080. struct llama_data_write {
  16081. virtual void write(const void * src, size_t size) = 0;
  16082. virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
  16083. virtual size_t get_size_written() = 0;
  16084. virtual ~llama_data_write() = default;
  16085. void write_string(const std::string & str) {
  16086. uint32_t str_size = str.size();
  16087. write(&str_size, sizeof(str_size));
  16088. write(str.data(), str_size);
  16089. }
  16090. void write_model_info(const struct llama_context * ctx) {
  16091. std::string arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
  16092. write_string(arch_str);
  16093. // TODO: add more model-specific info which should prevent loading the session file if not identical
  16094. }
  16095. void write_rng(const std::mt19937 & rng) {
  16096. std::ostringstream rng_ss;
  16097. rng_ss << rng;
  16098. const std::string & rng_str = rng_ss.str();
  16099. write_string(rng_str);
  16100. }
  16101. void write_output_ids(struct llama_context * ctx) {
  16102. llama_output_reorder(ctx);
  16103. const uint32_t n_outputs = ctx->n_outputs;
  16104. std::vector<int32_t> output_pos;
  16105. const size_t n_batch = ctx->cparams.n_batch;
  16106. const auto & output_ids = ctx->output_ids;
  16107. GGML_ASSERT(n_outputs <= ctx->output_size);
  16108. output_pos.resize(n_outputs);
  16109. // build a more compact representation of the output ids
  16110. for (size_t i = 0; i < n_batch; ++i) {
  16111. // map an output id to a position in the batch
  16112. int32_t pos = output_ids[i];
  16113. if (pos >= 0) {
  16114. GGML_ASSERT((uint32_t) pos < n_outputs);
  16115. output_pos[pos] = i;
  16116. }
  16117. }
  16118. write(&n_outputs, sizeof(n_outputs));
  16119. if (n_outputs) {
  16120. write(output_pos.data(), n_outputs * sizeof(int32_t));
  16121. }
  16122. }
  16123. void write_logits(const struct llama_context * ctx) {
  16124. const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
  16125. write(&logits_size, sizeof(logits_size));
  16126. if (logits_size) {
  16127. write(ctx->logits, logits_size * sizeof(float));
  16128. }
  16129. }
  16130. void write_embeddings(const struct llama_context * ctx) {
  16131. const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd);
  16132. write(&embeddings_size, sizeof(embeddings_size));
  16133. if (embeddings_size) {
  16134. write(ctx->embd, embeddings_size * sizeof(float));
  16135. }
  16136. }
  16137. void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) {
  16138. for (const auto & range : cell_ranges) {
  16139. for (uint32_t i = range.first; i < range.second; ++i) {
  16140. const auto & cell = kv_self.cells[i];
  16141. const llama_pos pos = cell.pos;
  16142. const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
  16143. write(&pos, sizeof(pos));
  16144. write(&n_seq_id, sizeof(n_seq_id));
  16145. if (n_seq_id) {
  16146. for (auto seq_id : cell.seq_id) {
  16147. write(&seq_id, sizeof(seq_id));
  16148. }
  16149. }
  16150. }
  16151. }
  16152. }
  16153. void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) {
  16154. const struct llama_kv_cache & kv_self = ctx->kv_self;
  16155. const struct llama_hparams & hparams = ctx->model.hparams;
  16156. const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
  16157. const uint32_t n_layer = hparams.n_layer;
  16158. write(&v_trans, sizeof(v_trans));
  16159. write(&n_layer, sizeof(n_layer));
16161. // Iterate and write all the keys first, each row is a cell
16162. // Get whole range at a time
  16163. for (uint32_t il = 0; il < n_layer; ++il) {
  16164. const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
  16165. // Write key type
  16166. const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
  16167. write(&k_type_i, sizeof(k_type_i));
  16168. // Write row size of key
  16169. const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
  16170. write(&k_size_row, sizeof(k_size_row));
16171. // Write out each range of cells, one k_size_row-sized row per cell, directly from the tensor
  16172. for (const auto & range : cell_ranges) {
  16173. const size_t range_size = range.second - range.first;
  16174. const size_t buf_size = range_size * k_size_row;
  16175. write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
  16176. }
  16177. }
  16178. if (!kv_self.v_trans) {
  16179. for (uint32_t il = 0; il < n_layer; ++il) {
  16180. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  16181. // Write value type
  16182. const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
  16183. write(&v_type_i, sizeof(v_type_i));
  16184. // Write row size of value
  16185. const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
  16186. write(&v_size_row, sizeof(v_size_row));
16187. // Write out each range of cells, one v_size_row-sized row per cell, directly from the tensor
  16188. for (const auto & range : cell_ranges) {
  16189. const size_t range_size = range.second - range.first;
  16190. const size_t buf_size = range_size * v_size_row;
  16191. write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
  16192. }
  16193. }
  16194. } else {
  16195. // When v is transposed, we also need the element size and get the element ranges from each row
  16196. const uint32_t kv_size = kv_self.size;
  16197. for (uint32_t il = 0; il < n_layer; ++il) {
  16198. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  16199. // Write value type
  16200. const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
  16201. write(&v_type_i, sizeof(v_type_i));
  16202. // Write element size
  16203. const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
  16204. write(&v_size_el, sizeof(v_size_el));
  16205. // Write GQA embedding size
  16206. write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
  16207. // For each row, we get the element values of each cell
  16208. for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
16209. // Write out each range of cells, v_size_el bytes per cell, directly from the tensor
  16210. for (const auto & range : cell_ranges) {
  16211. const size_t range_size = range.second - range.first;
  16212. const size_t src_offset = (range.first + j * kv_size) * v_size_el;
  16213. const size_t buf_size = range_size * v_size_el;
  16214. write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
  16215. }
  16216. }
  16217. }
  16218. }
  16219. }
  16220. void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) {
  16221. const struct llama_kv_cache & kv_self = ctx->kv_self;
  16222. std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
  16223. uint32_t cell_count = 0;
  16224. // Count the number of cells with the specified seq_id
  16225. // Find all the ranges of cells with this seq id (or all, when -1)
  16226. uint32_t cell_range_begin = kv_self.size;
  16227. for (uint32_t i = 0; i < kv_self.size; ++i) {
  16228. const auto & cell = kv_self.cells[i];
  16229. if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
  16230. ++cell_count;
  16231. if (cell_range_begin == kv_self.size) {
  16232. cell_range_begin = i;
  16233. }
  16234. } else {
  16235. if (cell_range_begin != kv_self.size) {
  16236. cell_ranges.emplace_back(cell_range_begin, i);
  16237. cell_range_begin = kv_self.size;
  16238. }
  16239. }
  16240. }
  16241. if (cell_range_begin != kv_self.size) {
  16242. cell_ranges.emplace_back(cell_range_begin, kv_self.size);
  16243. }
  16244. // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
  16245. uint32_t cell_count_check = 0;
  16246. for (const auto & range : cell_ranges) {
  16247. cell_count_check += range.second - range.first;
  16248. }
  16249. GGML_ASSERT(cell_count == cell_count_check);
  16250. write(&cell_count, sizeof(cell_count));
  16251. write_kv_cache_meta(kv_self, cell_ranges, seq_id);
  16252. write_kv_cache_data(ctx, cell_ranges);
  16253. }
  16254. };
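// Sketch: a minimal concrete llama_data_write that only counts bytes, e.g. to
// size a state buffer before serializing; the class name is illustrative and
// not part of the interface above.
//
//     struct llama_data_write_size_counter : llama_data_write {
//         size_t size_written = 0;
//
//         void write(const void * /*src*/, size_t size) override {
//             size_written += size;
//         }
//         void write_tensor_data(const struct ggml_tensor * /*tensor*/, size_t /*offset*/, size_t size) override {
//             size_written += size;
//         }
//         size_t get_size_written() override {
//             return size_written;
//         }
//     };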
  16255. struct llama_data_read {
  16256. virtual const uint8_t * read(size_t size) = 0;
  16257. virtual void read_to(void * dst, size_t size) = 0;
  16258. virtual size_t get_size_read() = 0;
  16259. virtual ~llama_data_read() = default;
  16260. void read_string(std::string & str) {
  16261. uint32_t str_size;
  16262. read_to(&str_size, sizeof(str_size));
  16263. str.assign((const char *) read(str_size), str_size);
  16264. }
  16265. // validate model information
  16266. void read_model_info(const struct llama_context * ctx) {
  16267. std::string cur_arch_str = LLM_ARCH_NAMES.at(ctx->model.arch);
  16268. std::string arch_str;
  16269. read_string(arch_str);
  16270. if (cur_arch_str != arch_str) {
  16271. throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
  16272. }
  16273. // TODO: add more info which needs to be identical but which is not verified otherwise
  16274. }
  16275. void read_rng(std::mt19937 & rng) {
  16276. std::string rng_str;
  16277. read_string(rng_str);
  16278. std::istringstream rng_ss(rng_str);
  16279. rng_ss >> rng;
  16280. if (rng_ss.fail()) {
  16281. throw std::runtime_error("failed to load RNG state");
  16282. }
  16283. }
  16284. void read_output_ids(struct llama_context * ctx) {
  16285. std::vector<int32_t> output_pos;
  16286. uint32_t n_outputs;
  16287. read_to(&n_outputs, sizeof(n_outputs));
  16288. if (n_outputs > llama_output_reserve(*ctx, n_outputs)) {
  16289. throw std::runtime_error("could not reserve outputs");
  16290. }
  16291. if (n_outputs) {
  16292. output_pos.resize(n_outputs);
  16293. read_to(output_pos.data(), n_outputs * sizeof(int32_t));
  16294. for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
  16295. int32_t id = output_pos[i];
  16296. if ((uint32_t) id >= ctx->cparams.n_batch) {
  16297. throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch));
  16298. }
  16299. ctx->output_ids[id] = i;
  16300. }
  16301. ctx->n_outputs = n_outputs;
  16302. }
  16303. }
  16304. void read_logits(struct llama_context * ctx) {
  16305. uint64_t logits_size;
  16306. read_to(&logits_size, sizeof(logits_size));
  16307. if (ctx->logits_size < logits_size) {
  16308. throw std::runtime_error("logits buffer too small");
  16309. }
  16310. if (logits_size) {
  16311. read_to(ctx->logits, logits_size * sizeof(float));
  16312. }
  16313. }
  16314. void read_embeddings(struct llama_context * ctx) {
  16315. uint64_t embeddings_size;
  16316. read_to(&embeddings_size, sizeof(embeddings_size));
  16317. if (ctx->embd_size < embeddings_size) {
  16318. throw std::runtime_error("embeddings buffer too small");
  16319. }
  16320. if (embeddings_size) {
  16321. read_to(ctx->embd, embeddings_size * sizeof(float));
  16322. }
  16323. }
  16324. bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) {
  16325. struct llama_kv_cache & kv_self = ctx->kv_self;
  16326. if (dest_seq_id != -1) {
  16327. // single sequence
  16328. llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
  16329. llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
  16330. batch.n_tokens = cell_count;
  16331. batch.n_seq_tokens = cell_count;
  16332. batch.n_seqs = 1;
  16333. for (uint32_t i = 0; i < cell_count; ++i) {
  16334. llama_pos pos;
  16335. uint32_t n_seq_id;
  16336. read_to(&pos, sizeof(pos));
  16337. read_to(&n_seq_id, sizeof(n_seq_id));
  16338. if (n_seq_id != 0) {
  16339. LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
  16340. return false;
  16341. }
  16342. batch.pos[i] = pos;
  16343. }
  16344. batch.n_seq_id[0] = 1;
  16345. batch.seq_id[0] = &dest_seq_id;
  16346. if (!llama_kv_cache_find_slot(kv_self, batch)) {
  16347. LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
  16348. return false;
  16349. }
  16350. // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
  16351. // Assume that this is one contiguous block of cells
  16352. GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
  16353. GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
  16354. GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
  16355. GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
  16356. GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
  16357. } else {
  16358. // whole KV cache restore
  16359. if (cell_count > kv_self.size) {
  16360. LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
  16361. return false;
  16362. }
  16363. llama_kv_cache_clear(kv_self);
  16364. for (uint32_t i = 0; i < cell_count; ++i) {
  16365. llama_kv_cell & cell = kv_self.cells[i];
  16366. llama_pos pos;
  16367. uint32_t n_seq_id;
  16368. read_to(&pos, sizeof(pos));
  16369. read_to(&n_seq_id, sizeof(n_seq_id));
  16370. cell.pos = pos;
  16371. for (uint32_t j = 0; j < n_seq_id; ++j) {
  16372. llama_seq_id seq_id;
  16373. read_to(&seq_id, sizeof(seq_id));
  16374. if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
  16375. LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
  16376. return false;
  16377. }
  16378. cell.seq_id.insert(seq_id);
  16379. if (kv_self.recurrent) {
  16380. int32_t & tail = kv_self.cells[seq_id].tail;
  16381. if (tail != -1) {
  16382. LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
  16383. return false;
  16384. }
  16385. tail = i;
  16386. }
  16387. }
  16388. }
  16389. kv_self.head = 0;
  16390. kv_self.used = cell_count;
  16391. }
  16392. if (kv_self.recurrent) {
  16393. for (uint32_t i = 0; i < cell_count; ++i) {
  16394. uint32_t cell_id = kv_self.head + i;
  16395. // make sure the recurrent states will keep their restored state
  16396. kv_self.cells[cell_id].src = cell_id;
  16397. }
  16398. }
  16399. return true;
  16400. }
  16401. bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) {
  16402. const struct llama_hparams & hparams = ctx->model.hparams;
  16403. struct llama_kv_cache & kv_self = ctx->kv_self;
  16404. uint32_t v_trans;
  16405. uint32_t n_layer;
  16406. read_to(&v_trans, sizeof(v_trans));
  16407. read_to(&n_layer, sizeof(n_layer));
  16408. if (n_layer != hparams.n_layer) {
  16409. LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
  16410. return false;
  16411. }
  16412. if (cell_count > kv_self.size) {
  16413. LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size);
  16414. return false;
  16415. }
  16416. if (kv_self.v_trans != (bool) v_trans) {
  16417. LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
  16418. return false;
  16419. }
16420. // For each layer, read the keys for each cell; one row is one cell, read as one contiguous block
  16421. for (uint32_t il = 0; il < n_layer; ++il) {
  16422. const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
  16423. // Read type of key
  16424. int32_t k_type_i_ref;
  16425. read_to(&k_type_i_ref, sizeof(k_type_i_ref));
  16426. const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
  16427. if (k_type_i != k_type_i_ref) {
  16428. LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
  16429. return false;
  16430. }
  16431. // Read row size of key
  16432. uint64_t k_size_row_ref;
  16433. read_to(&k_size_row_ref, sizeof(k_size_row_ref));
  16434. const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
  16435. if (k_size_row != k_size_row_ref) {
  16436. LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
  16437. return false;
  16438. }
  16439. if (cell_count) {
  16440. // Read and set the keys for the whole cell range
  16441. ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row);
  16442. }
  16443. }
  16444. if (!kv_self.v_trans) {
  16445. for (uint32_t il = 0; il < n_layer; ++il) {
  16446. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  16447. // Read type of value
  16448. int32_t v_type_i_ref;
  16449. read_to(&v_type_i_ref, sizeof(v_type_i_ref));
  16450. const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
  16451. if (v_type_i != v_type_i_ref) {
  16452. LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
  16453. return false;
  16454. }
  16455. // Read row size of value
  16456. uint64_t v_size_row_ref;
  16457. read_to(&v_size_row_ref, sizeof(v_size_row_ref));
  16458. const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
  16459. if (v_size_row != v_size_row_ref) {
  16460. LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
  16461. return false;
  16462. }
  16463. if (cell_count) {
  16464. // Read and set the values for the whole cell range
  16465. ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row);
  16466. }
  16467. }
  16468. } else {
  16469. // For each layer, read the values for each cell (transposed)
  16470. for (uint32_t il = 0; il < n_layer; ++il) {
  16471. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  16472. // Read type of value
  16473. int32_t v_type_i_ref;
  16474. read_to(&v_type_i_ref, sizeof(v_type_i_ref));
  16475. const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
  16476. if (v_type_i != v_type_i_ref) {
  16477. LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
  16478. return false;
  16479. }
  16480. // Read element size of value
  16481. uint32_t v_size_el_ref;
  16482. read_to(&v_size_el_ref, sizeof(v_size_el_ref));
  16483. const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
  16484. if (v_size_el != v_size_el_ref) {
  16485. LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
  16486. return false;
  16487. }
  16488. // Read GQA embedding size
  16489. uint32_t n_embd_v_gqa_ref;
  16490. read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
  16491. if (n_embd_v_gqa != n_embd_v_gqa_ref) {
  16492. LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
  16493. return false;
  16494. }
  16495. if (cell_count) {
  16496. // For each row in the transposed matrix, read the values for the whole cell range
  16497. for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
  16498. const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el;
  16499. ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
  16500. }
  16501. }
  16502. }
  16503. }
  16504. return true;
  16505. }
  16506. void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) {
  16507. uint32_t cell_count;
  16508. read_to(&cell_count, sizeof(cell_count));
  16509. bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count);
  16510. if (!res) {
  16511. if (seq_id == -1) {
  16512. llama_kv_cache_clear(ctx);
  16513. } else {
  16514. llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
  16515. }
  16516. throw std::runtime_error("failed to restore kv cache");
  16517. }
  16518. }
  16519. };
  16520. struct llama_data_write_dummy : llama_data_write {
  16521. size_t size_written = 0;
  16522. llama_data_write_dummy() {}
  16523. void write(const void * /* src */, size_t size) override {
  16524. size_written += size;
  16525. }
  16526. void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
  16527. size_written += size;
  16528. }
  16529. size_t get_size_written() override {
  16530. return size_written;
  16531. }
  16532. };
  16533. struct llama_data_write_buffer : llama_data_write {
  16534. uint8_t * ptr;
  16535. size_t buf_size = 0;
  16536. size_t size_written = 0;
  16537. llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
  16538. void write(const void * src, size_t size) override {
  16539. if (size > buf_size) {
  16540. throw std::runtime_error("unexpectedly reached end of buffer");
  16541. }
  16542. memcpy(ptr, src, size);
  16543. ptr += size;
  16544. size_written += size;
  16545. buf_size -= size;
  16546. }
  16547. void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
  16548. if (size > buf_size) {
  16549. throw std::runtime_error("unexpectedly reached end of buffer");
  16550. }
  16551. ggml_backend_tensor_get(tensor, ptr, offset, size);
  16552. ptr += size;
  16553. size_written += size;
  16554. buf_size -= size;
  16555. }
  16556. size_t get_size_written() override {
  16557. return size_written;
  16558. }
  16559. };
  16560. struct llama_data_read_buffer : llama_data_read {
  16561. const uint8_t * ptr;
  16562. size_t buf_size = 0;
  16563. size_t size_read = 0;
  16564. llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
  16565. const uint8_t * read(size_t size) override {
  16566. const uint8_t * base_ptr = ptr;
  16567. if (size > buf_size) {
  16568. throw std::runtime_error("unexpectedly reached end of buffer");
  16569. }
  16570. ptr += size;
  16571. size_read += size;
  16572. buf_size -= size;
  16573. return base_ptr;
  16574. }
  16575. void read_to(void * dst, size_t size) override {
  16576. memcpy(dst, read(size), size);
  16577. }
  16578. size_t get_size_read() override {
  16579. return size_read;
  16580. }
  16581. };
  16582. struct llama_data_write_file : llama_data_write {
  16583. llama_file * file;
  16584. size_t size_written = 0;
  16585. std::vector<uint8_t> temp_buffer;
  16586. llama_data_write_file(llama_file * f) : file(f) {}
  16587. void write(const void * src, size_t size) override {
  16588. file->write_raw(src, size);
  16589. size_written += size;
  16590. }
  16591. void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
  16592. temp_buffer.resize(size);
  16593. ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
  16594. write(temp_buffer.data(), temp_buffer.size());
  16595. }
  16596. size_t get_size_written() override {
  16597. return size_written;
  16598. }
  16599. };
  16600. struct llama_data_read_file : llama_data_read {
  16601. llama_file * file;
  16602. size_t size_read = 0;
  16603. std::vector<uint8_t> temp_buffer;
  16604. llama_data_read_file(llama_file * f) : file(f) {}
  16605. void read_to(void * dst, size_t size) override {
  16606. file->read_raw(dst, size);
  16607. size_read += size;
  16608. }
  16609. const uint8_t * read(size_t size) override {
  16610. temp_buffer.resize(size);
  16611. read_to(temp_buffer.data(), size);
  16612. return temp_buffer.data();
  16613. }
  16614. size_t get_size_read() override {
  16615. return size_read;
  16616. }
  16617. };
16618. /** copy state data into either a buffer or a file, depending on the passed-in data context
  16619. *
  16620. * file context:
  16621. * llama_file file("/path", "wb");
  16622. * llama_data_write_file data_ctx(&file);
  16623. * llama_state_get_data_internal(ctx, data_ctx);
  16624. *
  16625. * buffer context:
  16626. * std::vector<uint8_t> buf(max_size, 0);
  16627. * llama_data_write_buffer data_ctx(buf.data(), max_size);
  16628. * llama_state_get_data_internal(ctx, data_ctx);
  16629. *
  16630. */
  16631. static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) {
  16632. llama_synchronize(ctx);
  16633. data_ctx.write_model_info(ctx);
  16634. data_ctx.write_rng(ctx->sampling.rng);
  16635. // copy outputs
  16636. data_ctx.write_output_ids(ctx);
  16637. data_ctx.write_logits(ctx);
  16638. data_ctx.write_embeddings(ctx);
  16639. data_ctx.write_kv_cache(ctx);
  16640. return data_ctx.get_size_written();
  16641. }
  16642. size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) {
  16643. llama_data_write_buffer data_ctx(dst, size);
  16644. try {
  16645. return llama_state_get_data_internal(ctx, data_ctx);
  16646. } catch (const std::exception & err) {
  16647. LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
  16648. return 0;
  16649. }
  16650. }
16651. // Returns the *actual* size of the state.
16652. // Intended to be used when saving the state to a buffer.
  16653. size_t llama_state_get_size(struct llama_context * ctx) {
  16654. llama_data_write_dummy data_ctx;
  16655. try {
  16656. return llama_state_get_data_internal(ctx, data_ctx);
  16657. } catch (const std::exception & err) {
  16658. LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
  16659. return 0;
  16660. }
  16661. }
  16662. static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) {
  16663. llama_synchronize(ctx);
  16664. data_ctx.read_model_info(ctx);
  16665. // set rng
  16666. data_ctx.read_rng(ctx->sampling.rng);
  16667. // set outputs
  16668. data_ctx.read_output_ids(ctx);
  16669. data_ctx.read_logits(ctx);
  16670. data_ctx.read_embeddings(ctx);
  16671. data_ctx.read_kv_cache(ctx);
  16672. return data_ctx.get_size_read();
  16673. }
  16674. // Sets the state reading from the specified source address
  16675. size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) {
  16676. llama_data_read_buffer data_ctx(src, size);
  16677. try {
  16678. return llama_state_set_data_internal(ctx, data_ctx);
  16679. } catch (const std::exception & err) {
  16680. LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
  16681. return 0;
  16682. }
  16683. }
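// Example (illustrative sketch, not upstream code): round-tripping the full context state
// through a caller-owned buffer with the two-call pattern (query the size, then copy).
// Assumes a valid `ctx`; the restoring context must use the same model and parameters.
//
//   std::vector<uint8_t> state(llama_state_get_size(ctx));
//   const size_t n_written = llama_state_get_data(ctx, state.data(), state.size());
//
//   // ... later, possibly in another process with an identically configured ctx ...
//   const size_t n_read = llama_state_set_data(ctx, state.data(), state.size());
//   // both calls return 0 on failure, otherwise the number of bytes used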
  16684. static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  16685. llama_file file(path_session, "rb");
  16686. // sanity checks
  16687. {
  16688. const uint32_t magic = file.read_u32();
  16689. const uint32_t version = file.read_u32();
  16690. if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
  16691. LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
  16692. return false;
  16693. }
  16694. }
  16695. // load the prompt
  16696. {
  16697. const uint32_t n_token_count = file.read_u32();
  16698. if (n_token_count > n_token_capacity) {
  16699. LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
  16700. return false;
  16701. }
  16702. file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
  16703. *n_token_count_out = n_token_count;
  16704. }
  16705. // restore the context state
  16706. {
  16707. const size_t n_state_size_cur = file.size - file.tell();
  16708. llama_data_read_file data_ctx(&file);
  16709. const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);
  16710. if (n_read != n_state_size_cur) {
  16711. LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
  16712. return false;
  16713. }
  16714. }
  16715. return true;
  16716. }
  16717. bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  16718. try {
  16719. return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
  16720. } catch (const std::exception & err) {
  16721. LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
  16722. return false;
  16723. }
  16724. }
  16725. static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  16726. llama_file file(path_session, "wb");
  16727. file.write_u32(LLAMA_SESSION_MAGIC);
  16728. file.write_u32(LLAMA_SESSION_VERSION);
  16729. // save the prompt
  16730. file.write_u32((uint32_t) n_token_count);
  16731. file.write_raw(tokens, sizeof(llama_token) * n_token_count);
  16732. // save the context state using stream saving
  16733. llama_data_write_file data_ctx(&file);
  16734. llama_state_get_data_internal(ctx, data_ctx);
  16735. return true;
  16736. }
  16737. bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  16738. try {
  16739. return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
  16740. } catch (const std::exception & err) {
  16741. LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
  16742. return false;
  16743. }
  16744. }
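// Example (illustrative sketch, not upstream code): persisting a session to disk and resuming it.
// `ctx`, `tokens` (the prompt already evaluated in ctx) and the file path are assumed to be
// provided by the caller.
//
//   llama_state_save_file(ctx, "session.bin", tokens.data(), tokens.size());
//
//   std::vector<llama_token> restored(llama_n_ctx(ctx));
//   size_t n_restored = 0;
//   if (llama_state_load_file(ctx, "session.bin", restored.data(), restored.size(), &n_restored)) {
//       restored.resize(n_restored);
//   }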
  16745. static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) {
  16746. llama_synchronize(ctx);
  16747. data_ctx.write_kv_cache(ctx, seq_id);
  16748. return data_ctx.get_size_written();
  16749. }
  16750. size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) {
  16751. llama_data_write_dummy data_ctx;
  16752. return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
  16753. }
  16754. size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
  16755. llama_data_write_buffer data_ctx(dst, size);
  16756. try {
  16757. return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
  16758. } catch (const std::exception & err) {
  16759. LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what());
  16760. return 0;
  16761. }
  16762. }
  16763. static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) {
  16764. llama_synchronize(ctx);
  16765. data_ctx.read_kv_cache(ctx, dest_seq_id);
  16766. return data_ctx.get_size_read();
  16767. }
  16768. size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) {
  16769. llama_data_read_buffer data_ctx(src, size);
  16770. try {
  16771. return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
  16772. } catch (const std::exception & err) {
  16773. LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what());
  16774. return 0;
  16775. }
  16776. }
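// Example (illustrative sketch, not upstream code): copying the cached state of one sequence
// into another through the sequence state API; sequence ids 0 and 1 are placeholders.
//
//   std::vector<uint8_t> seq_buf(llama_state_seq_get_size(ctx, 0));
//   if (llama_state_seq_get_data(ctx, seq_buf.data(), seq_buf.size(), 0) > 0) {
//       llama_state_seq_set_data(ctx, seq_buf.data(), seq_buf.size(), 1);
//   }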
  16777. static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
  16778. llama_file file(filepath, "wb");
  16779. file.write_u32(LLAMA_STATE_SEQ_MAGIC);
  16780. file.write_u32(LLAMA_STATE_SEQ_VERSION);
  16781. // save the prompt
  16782. file.write_u32((uint32_t) n_token_count);
  16783. file.write_raw(tokens, sizeof(llama_token) * n_token_count);
  16784. // save the context state using stream saving
  16785. llama_data_write_file data_ctx(&file);
  16786. llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
  16787. const size_t res = file.tell();
  16788. GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
  16789. return res;
  16790. }
  16791. static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  16792. llama_file file(filepath, "rb");
  16793. // version checks
  16794. {
  16795. const uint32_t magic = file.read_u32();
  16796. const uint32_t version = file.read_u32();
  16797. if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
  16798. LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
  16799. return 0;
  16800. }
  16801. }
  16802. // load the prompt
  16803. {
  16804. const uint32_t n_token_count = file.read_u32();
  16805. if (n_token_count > n_token_capacity) {
  16806. LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
  16807. return 0;
  16808. }
  16809. file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
  16810. *n_token_count_out = n_token_count;
  16811. }
  16812. // restore the context state
  16813. {
  16814. const size_t state_size = file.size - file.tell();
  16815. llama_data_read_file data_ctx(&file);
  16816. const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
  16817. if (!nread) {
  16818. LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
  16819. return 0;
  16820. }
  16821. GGML_ASSERT(nread <= state_size);
  16822. GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
  16823. }
  16824. return file.tell();
  16825. }
  16826. size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
  16827. try {
  16828. return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
  16829. } catch (const std::exception & err) {
  16830. LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
  16831. return 0;
  16832. }
  16833. }
  16834. size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  16835. try {
  16836. return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
  16837. } catch (const std::exception & err) {
  16838. LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
  16839. return 0;
  16840. }
  16841. }
  16842. void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
  16843. ctx->cparams.n_threads = n_threads;
  16844. ctx->cparams.n_threads_batch = n_threads_batch;
  16845. }
  16846. int32_t llama_n_threads(struct llama_context * ctx) {
  16847. return ctx->cparams.n_threads;
  16848. }
  16849. int32_t llama_n_threads_batch(struct llama_context * ctx) {
  16850. return ctx->cparams.n_threads_batch;
  16851. }
  16852. void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
  16853. ctx->abort_callback = abort_callback;
  16854. ctx->abort_callback_data = abort_callback_data;
  16855. }
  16856. void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
  16857. ctx->cparams.embeddings = embeddings;
  16858. }
  16859. void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
  16860. ctx->cparams.causal_attn = causal_attn;
  16861. }
  16862. struct llama_batch llama_batch_get_one(
  16863. llama_token * tokens,
  16864. int32_t n_tokens,
  16865. llama_pos pos_0,
  16866. llama_seq_id seq_id) {
  16867. return {
  16868. /*n_tokens =*/ n_tokens,
  16869. /*tokens =*/ tokens,
  16870. /*embd =*/ nullptr,
  16871. /*pos =*/ nullptr,
  16872. /*n_seq_id =*/ nullptr,
  16873. /*seq_id =*/ nullptr,
  16874. /*logits =*/ nullptr,
  16875. /*all_pos_0 =*/ pos_0,
  16876. /*all_pos_1 =*/ 1,
  16877. /*all_seq_id =*/ seq_id,
  16878. };
  16879. }
  16880. struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
  16881. llama_batch batch = {
  16882. /*n_tokens =*/ 0,
  16883. /*tokens =*/ nullptr,
  16884. /*embd =*/ nullptr,
  16885. /*pos =*/ nullptr,
  16886. /*n_seq_id =*/ nullptr,
  16887. /*seq_id =*/ nullptr,
  16888. /*logits =*/ nullptr,
  16889. /*all_pos_0 =*/ 0,
  16890. /*all_pos_1 =*/ 0,
  16891. /*all_seq_id =*/ 0,
  16892. };
  16893. if (embd) {
  16894. batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
  16895. } else {
  16896. batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
  16897. }
  16898. batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
  16899. batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
  16900. batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
  16901. for (int i = 0; i < n_tokens_alloc; ++i) {
  16902. batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
  16903. }
  16904. batch.seq_id[n_tokens_alloc] = nullptr;
  16905. batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);
  16906. return batch;
  16907. }
  16908. void llama_batch_free(struct llama_batch batch) {
  16909. if (batch.token) free(batch.token);
  16910. if (batch.embd) free(batch.embd);
  16911. if (batch.pos) free(batch.pos);
  16912. if (batch.n_seq_id) free(batch.n_seq_id);
  16913. if (batch.seq_id) {
  16914. for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
  16915. free(batch.seq_id[i]);
  16916. }
  16917. free(batch.seq_id);
  16918. }
  16919. if (batch.logits) free(batch.logits);
  16920. }
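// Example (illustrative sketch, not upstream code): filling a batch allocated with
// llama_batch_init and submitting it with llama_decode. `tokens` is caller-provided;
// sequence id 0 and the capacity of 512 are placeholders.
//
//   llama_batch batch = llama_batch_init(512, /*embd*/ 0, /*n_seq_max*/ 1);
//   for (int32_t i = 0; i < (int32_t) tokens.size(); ++i) {
//       batch.token   [i] = tokens[i];
//       batch.pos     [i] = i;
//       batch.n_seq_id[i] = 1;
//       batch.seq_id  [i][0] = 0;
//       batch.logits  [i] = i == (int32_t) tokens.size() - 1; // request logits only for the last token
//   }
//   batch.n_tokens = (int32_t) tokens.size();
//   if (llama_decode(ctx, batch) != 0) {
//       // handle failure
//   }
//   llama_batch_free(batch);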
  16921. int32_t llama_encode(
  16922. struct llama_context * ctx,
  16923. struct llama_batch batch) {
  16924. const int ret = llama_encode_internal(*ctx, batch);
  16925. if (ret < 0) {
  16926. LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
  16927. }
  16928. return ret;
  16929. }
  16930. int32_t llama_decode(
  16931. struct llama_context * ctx,
  16932. struct llama_batch batch) {
  16933. const int ret = llama_decode_internal(*ctx, batch);
  16934. if (ret < 0) {
  16935. LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
  16936. }
  16937. return ret;
  16938. }
  16939. void llama_synchronize(struct llama_context * ctx) {
  16940. ggml_backend_sched_synchronize(ctx->sched);
16941. // FIXME: if multiple single tokens are evaluated without a synchronization,
16942. // their timings will be added to the prompt evaluation stats;
16943. // this should only happen when a batch is evaluated one token at a time (batch size 1)
16944. // add the evaluation to the stats
  16945. if (ctx->n_queued_tokens == 1) {
  16946. ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
  16947. ctx->n_eval++;
  16948. } else if (ctx->n_queued_tokens > 1) {
  16949. ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
  16950. ctx->n_p_eval += ctx->n_queued_tokens;
  16951. }
  16952. // get a more accurate load time, upon first eval
  16953. if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) {
  16954. ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
  16955. ctx->has_evaluated_once = true;
  16956. }
  16957. ctx->n_queued_tokens = 0;
  16958. ctx->t_compute_start_us = 0;
  16959. }
  16960. float * llama_get_logits(struct llama_context * ctx) {
  16961. llama_synchronize(ctx);
  16962. // reorder logits for backward compatibility
  16963. // TODO: maybe deprecate this
  16964. llama_output_reorder(ctx);
  16965. return ctx->logits;
  16966. }
  16967. float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
  16968. int32_t j = -1;
  16969. llama_synchronize(ctx);
  16970. try {
  16971. if (ctx->logits == nullptr) {
  16972. throw std::runtime_error("no logits");
  16973. }
  16974. if (i < 0) {
  16975. j = ctx->n_outputs + i;
  16976. if (j < 0) {
  16977. throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
  16978. }
  16979. } else if ((size_t) i >= ctx->output_ids.size()) {
16980. throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
  16981. } else {
  16982. j = ctx->output_ids[i];
  16983. }
  16984. if (j < 0) {
  16985. throw std::runtime_error(format("batch.logits[%d] != true", i));
  16986. }
  16987. if (j >= ctx->n_outputs) {
  16988. // This should not happen
  16989. throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
  16990. }
  16991. return ctx->logits + j*ctx->model.hparams.n_vocab;
  16992. } catch (const std::exception & err) {
  16993. LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
  16994. #ifndef NDEBUG
  16995. GGML_ABORT("fatal error");
  16996. #endif
  16997. return nullptr;
  16998. }
  16999. }
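// Example (illustrative sketch, not upstream code): reading the logits of the most recent
// output and taking the arg-max token. Assumes the batch requested logits for that position
// and that `ctx` was created from `model`.
//
//   const float * logits  = llama_get_logits_ith(ctx, -1);
//   const int32_t n_vocab = llama_n_vocab(model);
//   llama_token best = 0;
//   for (llama_token id = 1; id < n_vocab; ++id) {
//       if (logits[id] > logits[best]) {
//           best = id;
//       }
//   }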
  17000. float * llama_get_embeddings(struct llama_context * ctx) {
  17001. llama_synchronize(ctx);
  17002. // reorder embeddings for backward compatibility
  17003. // TODO: maybe deprecate this
  17004. llama_output_reorder(ctx);
  17005. return ctx->embd;
  17006. }
  17007. float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
  17008. int32_t j = -1;
  17009. llama_synchronize(ctx);
  17010. try {
  17011. if (ctx->embd == nullptr) {
  17012. throw std::runtime_error("no embeddings");
  17013. }
  17014. if (i < 0) {
  17015. j = ctx->n_outputs + i;
  17016. if (j < 0) {
  17017. throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
  17018. }
  17019. } else if ((size_t) i >= ctx->output_ids.size()) {
17020. throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
  17021. } else {
  17022. j = ctx->output_ids[i];
  17023. }
  17024. if (j < 0) {
  17025. throw std::runtime_error(format("batch.logits[%d] != true", i));
  17026. }
  17027. if (j >= ctx->n_outputs) {
  17028. // This should not happen
  17029. throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
  17030. }
  17031. return ctx->embd + j*ctx->model.hparams.n_embd;
  17032. } catch (const std::exception & err) {
  17033. LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
  17034. #ifndef NDEBUG
  17035. GGML_ABORT("fatal error");
  17036. #endif
  17037. return nullptr;
  17038. }
  17039. }
  17040. float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
  17041. llama_synchronize(ctx);
  17042. auto it = ctx->embd_seq.find(seq_id);
  17043. if (it == ctx->embd_seq.end()) {
  17044. return nullptr;
  17045. }
  17046. return it->second.data();
  17047. }
  17048. //
  17049. // vocab
  17050. //
  17051. const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
  17052. return llama_token_get_text_impl(model->vocab, token);
  17053. }
  17054. float llama_token_get_score(const struct llama_model * model, llama_token token) {
  17055. return llama_token_get_score_impl(model->vocab, token);
  17056. }
  17057. enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
  17058. return llama_token_get_attr_impl(model->vocab, token);
  17059. }
  17060. bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
  17061. return llama_token_is_eog_impl(model->vocab, token);
  17062. }
  17063. bool llama_token_is_control(const struct llama_model * model, llama_token token) {
  17064. return llama_token_is_control_impl(model->vocab, token);
  17065. }
  17066. llama_token llama_token_bos(const struct llama_model * model) {
  17067. return llama_token_bos_impl(model->vocab);
  17068. }
  17069. llama_token llama_token_eos(const struct llama_model * model) {
  17070. return llama_token_eos_impl(model->vocab);
  17071. }
  17072. llama_token llama_token_cls(const struct llama_model * model) {
  17073. return llama_token_cls_impl(model->vocab);
  17074. }
  17075. llama_token llama_token_sep(const struct llama_model * model) {
  17076. return llama_token_sep_impl(model->vocab);
  17077. }
  17078. llama_token llama_token_nl (const struct llama_model * model) {
  17079. return llama_token_nl_impl(model->vocab);
  17080. }
  17081. llama_token llama_token_pad(const struct llama_model * model) {
  17082. return llama_token_pad_impl(model->vocab);
  17083. }
  17084. bool llama_add_bos_token(const struct llama_model * model) {
  17085. return llama_add_bos_token_impl(model->vocab);
  17086. }
  17087. bool llama_add_eos_token(const struct llama_model * model) {
  17088. return llama_add_eos_token_impl(model->vocab);
  17089. }
  17090. llama_token llama_token_prefix(const struct llama_model * model) {
  17091. return llama_token_prefix_impl(model->vocab);
  17092. }
  17093. llama_token llama_token_middle(const struct llama_model * model) {
  17094. return llama_token_middle_impl(model->vocab);
  17095. }
  17096. llama_token llama_token_suffix(const struct llama_model * model) {
  17097. return llama_token_suffix_impl(model->vocab);
  17098. }
  17099. llama_token llama_token_eot(const struct llama_model * model) {
  17100. return llama_token_eot_impl(model->vocab);
  17101. }
  17102. //
  17103. // tokenization
  17104. //
  17105. int32_t llama_tokenize(
  17106. const struct llama_model * model,
  17107. const char * text,
  17108. int32_t text_len,
  17109. llama_token * tokens,
  17110. int32_t n_tokens_max,
  17111. bool add_special,
  17112. bool parse_special) {
  17113. return llama_tokenize_impl(model->vocab, text, text_len, tokens, n_tokens_max, add_special, parse_special);
  17114. }
  17115. int32_t llama_token_to_piece(
  17116. const struct llama_model * model,
  17117. llama_token token,
  17118. char * buf,
  17119. int32_t length,
  17120. int32_t lstrip,
  17121. bool special) {
  17122. return llama_token_to_piece_impl(model->vocab, token, buf, length, lstrip, special);
  17123. }
  17124. int32_t llama_detokenize(
  17125. const struct llama_model * model,
  17126. const llama_token * tokens,
  17127. int32_t n_tokens,
  17128. char * text,
  17129. int32_t text_len_max,
  17130. bool remove_special,
  17131. bool unparse_special) {
  17132. return llama_detokenize_impl(model->vocab, tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
  17133. }
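// Example (illustrative sketch, not upstream code): tokenizing a prompt and turning the tokens
// back into text with the wrappers above. Buffer sizes are placeholders; per llama.h, when the
// token buffer is too small llama_tokenize returns the negated required count.
//
//   const std::string prompt = "Hello, world";
//   std::vector<llama_token> toks(prompt.size() + 8);
//   const int32_t n_tok = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
//                                        toks.data(), (int32_t) toks.size(),
//                                        /*add_special*/ true, /*parse_special*/ false);
//   if (n_tok >= 0) {
//       toks.resize(n_tok);
//       std::vector<char> text(prompt.size() + 32);
//       llama_detokenize(model, toks.data(), (int32_t) toks.size(),
//                        text.data(), (int32_t) text.size(),
//                        /*remove_special*/ false, /*unparse_special*/ false);
//   }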
  17134. //
  17135. // chat templates
  17136. //
  17137. // Simple version of "llama_apply_chat_template" that only works with strings
17138. // This function uses heuristic checks to detect commonly used templates. It is not a jinja parser.
  17139. static int32_t llama_chat_apply_template_internal(
  17140. const std::string & tmpl,
  17141. const std::vector<const llama_chat_message *> & chat,
  17142. std::string & dest, bool add_ass) {
  17143. // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
  17144. std::stringstream ss;
  17145. auto tmpl_contains = [&tmpl](std::string haystack) -> bool {
  17146. return tmpl.find(haystack) != std::string::npos;
  17147. };
  17148. if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
  17149. // chatml template
  17150. for (auto message : chat) {
  17151. ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
  17152. }
  17153. if (add_ass) {
  17154. ss << "<|im_start|>assistant\n";
  17155. }
  17156. } else if (tmpl == "llama2" || tmpl == "mistral" || tmpl_contains("[INST]")) {
  17157. // llama2 template and its variants
  17158. // [variant] support system message
  17159. bool support_system_message = tmpl_contains("<<SYS>>") || tmpl == "mistral";
  17160. // [variant] space before + after response
  17161. bool space_around_response = tmpl_contains("' ' + eos_token");
  17162. // [variant] add BOS inside history
  17163. bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
  17164. // [variant] trim spaces from the input message
  17165. bool strip_message = tmpl_contains("content.strip()");
  17166. // construct the prompt
  17167. bool is_inside_turn = true; // skip BOS at the beginning
  17168. ss << "[INST] ";
  17169. for (auto message : chat) {
  17170. std::string content = strip_message ? trim(message->content) : message->content;
  17171. std::string role(message->role);
  17172. if (!is_inside_turn) {
  17173. is_inside_turn = true;
  17174. ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
  17175. }
  17176. if (role == "system") {
  17177. if (support_system_message) {
  17178. ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
  17179. } else {
  17180. // if the model does not support system message, we still include it in the first message, but without <<SYS>>
  17181. ss << content << "\n";
  17182. }
  17183. } else if (role == "user") {
  17184. ss << content << " [/INST]";
  17185. } else {
  17186. ss << (space_around_response ? " " : "") << content << (space_around_response ? " " : "") << "</s>";
  17187. is_inside_turn = false;
  17188. }
  17189. }
  17190. // llama2 templates seem to not care about "add_generation_prompt"
  17191. } else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
  17192. // Phi 3
  17193. for (auto message : chat) {
  17194. std::string role(message->role);
  17195. ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
  17196. }
  17197. if (add_ass) {
  17198. ss << "<|assistant|>\n";
  17199. }
  17200. } else if (tmpl == "zephyr" || tmpl_contains("<|user|>")) {
  17201. // zephyr template
  17202. for (auto message : chat) {
  17203. ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
  17204. }
  17205. if (add_ass) {
  17206. ss << "<|assistant|>\n";
  17207. }
  17208. } else if (tmpl == "monarch" || tmpl_contains("bos_token + message['role']")) {
  17209. // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
  17210. for (auto message : chat) {
  17211. std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
  17212. ss << bos << message->role << "\n" << message->content << "</s>\n";
  17213. }
  17214. if (add_ass) {
  17215. ss << "<s>assistant\n";
  17216. }
  17217. } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl_contains("<start_of_turn>")) {
  17218. // google/gemma-7b-it
  17219. std::string system_prompt = "";
  17220. for (auto message : chat) {
  17221. std::string role(message->role);
  17222. if (role == "system") {
  17223. // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
  17224. system_prompt = trim(message->content);
  17225. continue;
  17226. }
  17227. // in gemma, "assistant" is "model"
  17228. role = role == "assistant" ? "model" : message->role;
  17229. ss << "<start_of_turn>" << role << "\n";
  17230. if (!system_prompt.empty() && role != "model") {
  17231. ss << system_prompt << "\n\n";
  17232. system_prompt = "";
  17233. }
  17234. ss << trim(message->content) << "<end_of_turn>\n";
  17235. }
  17236. if (add_ass) {
  17237. ss << "<start_of_turn>model\n";
  17238. }
  17239. } else if (tmpl == "orion" || tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
  17240. // OrionStarAI/Orion-14B-Chat
  17241. std::string system_prompt = "";
  17242. for (auto message : chat) {
  17243. std::string role(message->role);
  17244. if (role == "system") {
  17245. // there is no system message support, we will merge it with user prompt
  17246. system_prompt = message->content;
  17247. continue;
  17248. } else if (role == "user") {
  17249. ss << "Human: ";
  17250. if (!system_prompt.empty()) {
  17251. ss << system_prompt << "\n\n";
  17252. system_prompt = "";
  17253. }
  17254. ss << message->content << "\n\nAssistant: </s>";
  17255. } else {
  17256. ss << message->content << "</s>";
  17257. }
  17258. }
  17259. } else if (tmpl == "openchat" || tmpl_contains("GPT4 Correct ")) {
  17260. // openchat/openchat-3.5-0106,
  17261. for (auto message : chat) {
  17262. std::string role(message->role);
  17263. if (role == "system") {
  17264. ss << message->content << "<|end_of_turn|>";
  17265. } else {
  17266. role[0] = toupper(role[0]);
  17267. ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
  17268. }
  17269. }
  17270. if (add_ass) {
  17271. ss << "GPT4 Correct Assistant:";
  17272. }
  17273. } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: "))) {
  17274. // eachadea/vicuna-13b-1.1 (and Orca variant)
  17275. for (auto message : chat) {
  17276. std::string role(message->role);
  17277. if (role == "system") {
  17278. // Orca-Vicuna variant uses a system prefix
  17279. if (tmpl == "vicuna-orca" || tmpl_contains("SYSTEM: ")) {
  17280. ss << "SYSTEM: " << message->content << "\n";
  17281. } else {
  17282. ss << message->content << "\n\n";
  17283. }
  17284. } else if (role == "user") {
  17285. ss << "USER: " << message->content << "\n";
  17286. } else if (role == "assistant") {
  17287. ss << "ASSISTANT: " << message->content << "</s>\n";
  17288. }
  17289. }
  17290. if (add_ass) {
  17291. ss << "ASSISTANT:";
  17292. }
  17293. } else if (tmpl == "deepseek" || (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>"))) {
  17294. // deepseek-ai/deepseek-coder-33b-instruct
  17295. for (auto message : chat) {
  17296. std::string role(message->role);
  17297. if (role == "system") {
  17298. ss << message->content;
  17299. } else if (role == "user") {
  17300. ss << "### Instruction:\n" << message->content << "\n";
  17301. } else if (role == "assistant") {
  17302. ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
  17303. }
  17304. }
  17305. if (add_ass) {
  17306. ss << "### Response:\n";
  17307. }
  17308. } else if (tmpl == "command-r" || (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>"))) {
  17309. // CohereForAI/c4ai-command-r-plus
  17310. for (auto message : chat) {
  17311. std::string role(message->role);
  17312. if (role == "system") {
  17313. ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
  17314. } else if (role == "user") {
  17315. ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
  17316. } else if (role == "assistant") {
  17317. ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
  17318. }
  17319. }
  17320. if (add_ass) {
  17321. ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
  17322. }
  17323. } else if (tmpl == "llama3" || (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>"))) {
  17324. // Llama 3
  17325. for (auto message : chat) {
  17326. std::string role(message->role);
  17327. ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
  17328. }
  17329. if (add_ass) {
  17330. ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
  17331. }
  17332. } else if (tmpl == "chatglm3" || tmpl_contains("[gMASK]sop")) {
  17333. // chatglm3-6b
  17334. ss << "[gMASK]" << "sop";
  17335. for (auto message : chat) {
  17336. std::string role(message->role);
  17337. ss << "<|" << role << "|>" << "\n " << message->content;
  17338. }
  17339. if (add_ass) {
  17340. ss << "<|assistant|>";
  17341. }
  17342. } else if (tmpl == "chatglm4" || tmpl_contains("[gMASK]<sop>")) {
  17343. ss << "[gMASK]" << "<sop>";
  17344. for (auto message : chat) {
  17345. std::string role(message->role);
  17346. ss << "<|" << role << "|>" << "\n" << message->content;
  17347. }
  17348. if (add_ass) {
  17349. ss << "<|assistant|>";
  17350. }
  17351. } else if (tmpl == "minicpm" || tmpl_contains(LU8("<用户>"))) {
  17352. // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
  17353. for (auto message : chat) {
  17354. std::string role(message->role);
  17355. if (role == "user") {
  17356. ss << LU8("<用户>");
  17357. ss << trim(message->content);
  17358. ss << "<AI>";
  17359. } else {
  17360. ss << trim(message->content);
  17361. }
  17362. }
  17363. } else if (tmpl == "deepseek2" || tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
  17364. // DeepSeek-V2
  17365. for (auto message : chat) {
  17366. std::string role(message->role);
  17367. if (role == "system") {
  17368. ss << message->content << "\n\n";
  17369. } else if (role == "user") {
  17370. ss << "User: " << message->content << "\n\n";
  17371. } else if (role == "assistant") {
  17372. ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>");
  17373. }
  17374. }
  17375. if (add_ass) {
  17376. ss << "Assistant:";
  17377. }
  17378. } else if (tmpl == "exaone3" || (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]"))) {
  17379. // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
  17380. // EXAONE-3.0-7.8B-Instruct
  17381. for (auto message : chat) {
  17382. std::string role(message->role);
  17383. if (role == "system") {
  17384. ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
  17385. } else if (role == "user") {
  17386. ss << "[|user|]" << trim(message->content) << "\n";
  17387. } else if (role == "assistant") {
  17388. ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
  17389. }
  17390. }
  17391. if (add_ass) {
  17392. ss << "[|assistant|]";
  17393. }
  17394. } else {
  17395. // template not supported
  17396. return -1;
  17397. }
  17398. dest = ss.str();
  17399. return dest.size();
  17400. }
  17401. int32_t llama_chat_apply_template(
  17402. const struct llama_model * model,
  17403. const char * tmpl,
  17404. const struct llama_chat_message * chat,
  17405. size_t n_msg,
  17406. bool add_ass,
  17407. char * buf,
  17408. int32_t length) {
  17409. std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
  17410. if (tmpl == nullptr) {
  17411. GGML_ASSERT(model != nullptr);
  17412. // load template from model
  17413. std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
  17414. std::string template_key = "tokenizer.chat_template";
  17415. int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
  17416. if (res < 0) {
  17417. // worst case: there is no information about template, we will use chatml by default
  17418. curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
  17419. } else {
  17420. curr_tmpl = std::string(model_template.data(), model_template.size());
  17421. }
  17422. }
  17423. // format the chat to string
  17424. std::vector<const llama_chat_message *> chat_vec;
  17425. chat_vec.resize(n_msg);
  17426. for (size_t i = 0; i < n_msg; i++) {
  17427. chat_vec[i] = &chat[i];
  17428. }
  17429. std::string formatted_chat;
  17430. int32_t res = llama_chat_apply_template_internal(curr_tmpl, chat_vec, formatted_chat, add_ass);
  17431. if (res < 0) {
  17432. return res;
  17433. }
  17434. if (buf && length > 0) {
  17435. strncpy(buf, formatted_chat.c_str(), length);
  17436. }
  17437. return res;
  17438. }
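// Example (illustrative sketch, not upstream code): applying the model's chat template (or the
// chatml fallback) with the two-pass buffer pattern implied by the return value above.
// Roles and contents are placeholders.
//
//   std::vector<llama_chat_message> msgs = {
//       { "system", "You are a helpful assistant." },
//       { "user",   "Hello!"                       },
//   };
//   std::vector<char> buf(4096);
//   int32_t n = llama_chat_apply_template(model, /*tmpl*/ nullptr, msgs.data(), msgs.size(),
//                                         /*add_ass*/ true, buf.data(), (int32_t) buf.size());
//   if (n > (int32_t) buf.size()) {
//       buf.resize(n);
//       n = llama_chat_apply_template(model, nullptr, msgs.data(), msgs.size(),
//                                     true, buf.data(), (int32_t) buf.size());
//   }
//   const std::string prompt = n >= 0 ? std::string(buf.data(), n) : "";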
  17439. //
  17440. // grammar
  17441. //
  17442. struct llama_grammar * llama_grammar_init(
  17443. const llama_grammar_element ** rules,
  17444. size_t n_rules,
  17445. size_t start_rule_index) {
  17446. return llama_grammar_init_impl(rules, n_rules, start_rule_index);
  17447. }
  17448. void llama_grammar_free(struct llama_grammar * grammar) {
  17449. llama_grammar_free_impl(grammar);
  17450. }
  17451. struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
  17452. return llama_grammar_copy_impl(grammar);
  17453. }
  17454. void llama_grammar_sample(
  17455. const struct llama_grammar * grammar,
  17456. const struct llama_context * ctx,
  17457. llama_token_data_array * candidates) {
  17458. llama_grammar_sample_impl(grammar, &ctx->model.vocab, &ctx->sampling, candidates);
  17459. }
  17460. void llama_sample_grammar(
  17461. struct llama_context * ctx,
  17462. llama_token_data_array * candidates,
  17463. const struct llama_grammar * grammar) {
  17464. llama_grammar_sample(grammar, ctx, candidates);
  17465. }
  17466. void llama_grammar_accept_token(
  17467. struct llama_grammar * grammar,
  17468. struct llama_context * ctx,
  17469. llama_token token) {
  17470. llama_grammar_accept_token_impl(grammar, &ctx->model.vocab, &ctx->sampling, token);
  17471. }
  17472. //
  17473. // sampling
  17474. //
  17475. void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
  17476. llama_set_rng_seed_impl(&ctx->sampling, seed);
  17477. }
  17478. void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
  17479. llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates);
  17480. }
  17481. void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
  17482. llama_sample_top_k_impl(ctx ? &ctx->sampling : nullptr, candidates, k, min_keep);
  17483. }
  17484. void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
  17485. llama_sample_top_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
  17486. }
  17487. void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
  17488. llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
  17489. }
  17490. void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
  17491. llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep);
  17492. }
  17493. void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
  17494. llama_sample_typical_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
  17495. }
  17496. void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
  17497. llama_sample_entropy_impl(ctx ? &ctx->sampling : nullptr, candidates_p, min_temp, max_temp, exponent_val);
  17498. }
  17499. void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
  17500. llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
  17501. }
  17502. void llama_sample_repetition_penalties(
  17503. struct llama_context * ctx,
  17504. llama_token_data_array * candidates,
  17505. const llama_token * last_tokens,
  17506. size_t penalty_last_n,
  17507. float penalty_repeat,
  17508. float penalty_freq,
  17509. float penalty_present) {
  17510. llama_sample_repetition_penalties_impl(ctx ? &ctx->sampling : nullptr, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
  17511. }
  17512. void llama_sample_apply_guidance(
  17513. struct llama_context * ctx,
  17514. float * logits,
  17515. float * logits_guidance,
  17516. float scale) {
  17517. llama_sample_apply_guidance_impl(&ctx->sampling, logits, logits_guidance, scale);
  17518. }
  17519. llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
  17520. return llama_sample_token_mirostat_impl(&ctx->sampling, candidates, tau, eta, m, mu);
  17521. }
  17522. llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
  17523. return llama_sample_token_mirostat_v2_impl(ctx ? &ctx->sampling : nullptr, candidates, tau, eta, mu);
  17524. }
  17525. llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
  17526. return llama_sample_token_greedy_impl(ctx ? &ctx->sampling : nullptr, candidates);
  17527. }
  17528. llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
  17529. return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, rng);
  17530. }
  17531. llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
  17532. return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, ctx->sampling.rng);
  17533. }
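// Example (illustrative sketch, not upstream code): a typical chain of the sampling wrappers
// above, applied to the logits of the last output; parameter values are placeholders.
//
//   const float * logits  = llama_get_logits_ith(ctx, -1);
//   const int32_t n_vocab = llama_n_vocab(llama_get_model(ctx));
//   std::vector<llama_token_data> cur;
//   cur.reserve(n_vocab);
//   for (llama_token id = 0; id < n_vocab; ++id) {
//       cur.push_back({ id, logits[id], 0.0f });
//   }
//   llama_token_data_array candidates = { cur.data(), cur.size(), false };
//   llama_sample_top_k(ctx, &candidates, /*k*/ 40, /*min_keep*/ 1);
//   llama_sample_top_p(ctx, &candidates, /*p*/ 0.95f, /*min_keep*/ 1);
//   llama_sample_temp (ctx, &candidates, /*temp*/ 0.80f);
//   const llama_token next = llama_sample_token(ctx, &candidates);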
  17534. int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
  17535. static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
  17536. if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
  17537. return strlen(split_path);
  17538. }
  17539. return 0;
  17540. }
  17541. int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
  17542. std::string str_split_path(split_path);
  17543. char postfix[32];
  17544. snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
  17545. std::string str_postfix(postfix);
  17546. // check if dest ends with postfix
  17547. int size_prefix = str_split_path.size() - str_postfix.size();
  17548. if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
  17549. snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
  17550. return size_prefix;
  17551. }
  17552. return 0;
  17553. }
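// Example (illustrative sketch, not upstream code): llama_split_path fills in the 1-based shard
// index, so prefix "model" with split_no 1 of 4 yields "model-00002-of-00004.gguf".
//
//   char split_path[512]; // buffer size is a placeholder
//   llama_split_path(split_path, sizeof(split_path), "model", /*split_no*/ 1, /*split_count*/ 4);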
struct llama_timings llama_get_timings(struct llama_context * ctx) {
    struct llama_timings result = {
        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
        /*.t_sample_ms =*/ 1e-3 * ctx->sampling.t_sample_us,
        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,

        /*.n_sample =*/ std::max(1, ctx->sampling.n_sample),
        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
        /*.n_eval   =*/ std::max(1, ctx->n_eval),
    };

    return result;
}

void llama_print_timings(struct llama_context * ctx) {
    const llama_timings timings = llama_get_timings(ctx);

    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
}

void llama_reset_timings(struct llama_context * ctx) {
    ctx->t_start_us  = ggml_time_us();
    ctx->t_eval_us   = ctx->n_eval   = 0;
    ctx->t_p_eval_us = ctx->n_p_eval = 0;

    ctx->sampling.reset_timings();
}
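// Usage sketch (illustrative): report and then clear the per-context counters
// between independent runs so that each report reflects only its own work.
//
//     llama_print_timings(ctx);   // log load / sample / prompt-eval / eval timings
//     llama_reset_timings(ctx);   // start the next measurement window from zero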
const char * llama_print_system_info(void) {
    static std::string s;

    s  = "";
    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
    s += "AVX_VNNI = "    + std::to_string(ggml_cpu_has_avx_vnni())    + " | ";
    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
    s += "SVE = "         + std::to_string(ggml_cpu_has_sve())         + " | ";
    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
    s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
    s += "LLAMAFILE = "   + std::to_string(ggml_cpu_has_llamafile())   + " | ";

    return s.c_str();
}
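// Usage sketch (illustrative): the returned pointer refers to a function-local
// static std::string, so it remains valid until the next call, must not be freed,
// and the function is not safe to call concurrently from multiple threads.
//
//     fprintf(stderr, "system info: %s\n", llama_print_system_info());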
void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
    fprintf(stream, "\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "# Timings #\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "\n");
    fprintf(stream, "mst_eval: %.2f  # ms / token during generation\n",
            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
    fprintf(stream, "mst_sample: %.2f  # ms / token during sampling\n",
            1.0e-3 * ctx->sampling.t_sample_us / ctx->sampling.n_sample);
    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
    fprintf(stream, "n_sample: %d  # number of sampled tokens\n", ctx->sampling.n_sample);
    fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
    fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
    fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
    fprintf(stream, "t_sample_us: %" PRId64 "  # total microseconds spent sampling\n", ctx->sampling.t_sample_us);
    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
            1.0e6 * ctx->n_eval / ctx->t_eval_us);
    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
    fprintf(stream, "ts_sample: %.2f  # tokens / second during sampling\n",
            1.0e6 * ctx->sampling.n_sample / ctx->sampling.t_sample_us);
}

// For internal test use
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx
) {
    return ctx->model.tensors_by_name;
}

void llama_log_set(ggml_log_callback log_callback, void * user_data) {
    g_state.log_callback           = log_callback ? log_callback : llama_log_callback_default;
    g_state.log_callback_user_data = user_data;
#ifdef GGML_USE_METAL
    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#elif defined(GGML_USE_CUDA)
    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#elif defined(GGML_USE_CANN)
    ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#endif
}
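// Usage sketch (illustrative): install a custom callback that forwards only
// warnings and errors to the application's own sink; passing nullptr restores
// llama_log_callback_default (stderr). The callback name is hypothetical.
//
//     static void my_log(ggml_log_level level, const char * text, void * user_data) {
//         (void) user_data;
//         if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR) {
//             fputs(text, stderr);
//         }
//     }
//
//     llama_log_set(my_log,  nullptr);   // route llama.cpp (and backend) logs through my_log
//     llama_log_set(nullptr, nullptr);   // revert to the default stderr callback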
static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
    va_list args_copy;
    va_copy(args_copy, args);
    char buffer[128];
    int len = vsnprintf(buffer, 128, format, args);
    if (len < 128) {
        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
    } else {
        // message did not fit in the fixed-size buffer: format again into a heap
        // buffer of exactly the required length, using the copied va_list
        char * buffer2 = new char[len + 1];
        vsnprintf(buffer2, len + 1, format, args_copy);
        buffer2[len] = 0;
        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
        delete[] buffer2;
    }
    va_end(args_copy);
}

void llama_log_internal(ggml_log_level level, const char * format, ...) {
    va_list args;
    va_start(args, format);
    llama_log_internal_v(level, format, args);
    va_end(args);
}

void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
    fflush(stderr);
}