// ggml.c
/**
 * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
#define _USE_MATH_DEFINES // For M_PI on MSVC

#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-threading.h"
#include "ggml.h"

// FIXME: required here for quantization functions
#include "ggml-quants.h"

#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>
#endif

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h>
#endif

#include <assert.h>
#include <errno.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <float.h>
#include <limits.h>
#include <stdarg.h>
#include <signal.h>
#if defined(__gnu_linux__)
#include <syscall.h>
#endif

#if defined(__APPLE__)
#include <unistd.h>
#include <mach/mach.h>
#include <TargetConditionals.h>
#endif

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
    #define NOMINMAX
#endif
#include <windows.h>
#endif

#define UNUSED GGML_UNUSED

#if defined(_MSC_VER)
#define m512bh(p) p
#define m512i(p) p
#else
#define m512bh(p) (__m512bh)(p)
#define m512i(p) (__m512i)(p)
#endif

// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
float ggml_table_f32_f16[1 << 16];

#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
    (!defined(TARGET_OS_TV) && !defined(TARGET_OS_WATCH))
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>

#if defined(__ANDROID__)
#include <unwind.h>
#include <dlfcn.h>
#include <stdio.h>

struct backtrace_state {
    void ** current;
    void ** end;
};

static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
    struct backtrace_state * state = (struct backtrace_state *)arg;
    uintptr_t pc = _Unwind_GetIP(context);
    if (pc) {
        if (state->current == state->end) {
            return _URC_END_OF_STACK;
        } else {
            *state->current++ = (void*)pc;
        }
    }
    return _URC_NO_REASON;
}

static void ggml_print_backtrace_symbols(void) {
    const int max = 100;
    void* buffer[max];
    struct backtrace_state state = {buffer, buffer + max};
    _Unwind_Backtrace(unwind_callback, &state);

    int count = state.current - buffer;

    for (int idx = 0; idx < count; ++idx) {
        const void * addr = buffer[idx];
        const char * symbol = "";

        Dl_info info;
        if (dladdr(addr, &info) && info.dli_sname) {
            symbol = info.dli_sname;
        }

        fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
    }
}
#elif defined(__linux__) && defined(__GLIBC__)
#include <execinfo.h>
static void ggml_print_backtrace_symbols(void) {
    void * trace[100];
    int nptrs = backtrace(trace, sizeof(trace)/sizeof(trace[0]));
    backtrace_symbols_fd(trace, nptrs, STDERR_FILENO);
}
#else
static void ggml_print_backtrace_symbols(void) {
    // platform not supported
}
#endif

static void ggml_print_backtrace(void) {
    char attach[32];
    snprintf(attach, sizeof(attach), "attach %d", getpid());
    int pid = fork();
    if (pid == 0) {
        // try gdb
        execlp("gdb", "gdb", "--batch",
            "-ex", "set style enabled on",
            "-ex", attach,
            "-ex", "bt -frame-info source-and-location",
            "-ex", "detach",
            "-ex", "quit",
            (char *) NULL);
        // try lldb
        execlp("lldb", "lldb", "--batch",
            "-o", "bt",
            "-o", "quit",
            "-p", attach,
            (char *) NULL);
        exit(EXIT_FAILURE);
    } else {
        int wstatus;
        waitpid(pid, &wstatus, 0);
        if (WIFEXITED(wstatus)) {
            if (WEXITSTATUS(wstatus) == EXIT_FAILURE) {
                // gdb failed, fallback to backtrace_symbols
                ggml_print_backtrace_symbols();
            }
        }
    }
}
#else
static void ggml_print_backtrace(void) {
    // platform not supported
}
#endif

void ggml_abort(const char * file, int line, const char * fmt, ...) {
    fflush(stdout);

    fprintf(stderr, "%s:%d: ", file, line);

    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);

    fprintf(stderr, "\n");

    ggml_print_backtrace();
    abort();
}
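
// NOTE (editorial, illustrative only -- not part of the upstream file):
// ggml_abort() is normally reached through the GGML_ABORT()/GGML_ASSERT() macros from
// ggml.h, which supply __FILE__ and __LINE__ automatically, e.g.
//
//     GGML_ASSERT(tensor != NULL);                    // aborts with "file:line: GGML_ASSERT(...)"-style output
//     GGML_ABORT("unsupported type %d", (int) type);  // printf-style message, then backtrace + abort()
//
// Both paths end up in ggml_abort() above, which prints the message, attempts to attach
// a debugger for a backtrace, and finally calls abort().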
//
// logging
//

struct ggml_logger_state {
    ggml_log_callback log_callback;
    void * log_callback_user_data;
};

static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};

static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
    if (format == NULL) {
        return;
    }
    va_list args_copy;
    va_copy(args_copy, args);
    char buffer[128];
    int len = vsnprintf(buffer, 128, format, args);
    if (len < 128) {
        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
    } else {
        char * buffer2 = (char *) calloc(len + 1, sizeof(char));
        vsnprintf(buffer2, len + 1, format, args_copy);
        buffer2[len] = 0;
        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
        free(buffer2);
    }
    va_end(args_copy);
}

void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
    va_list args;
    va_start(args, format);
    ggml_log_internal_v(level, format, args);
    va_end(args);
}

void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
    fflush(stderr);
}

//
// end of logging block
//
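
// NOTE (editorial, illustrative only -- not part of the upstream file):
// inside ggml, messages are emitted through the GGML_LOG_* macros from ggml-impl.h
// (GGML_LOG_WARN / GGML_LOG_ERROR appear further down in this file), which forward to
// ggml_log_internal(). An application can redirect all output by installing its own
// callback; a minimal sketch, assuming the ggml_log_set() setter declared in ggml.h:
//
//     static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
//         fprintf((FILE *) user_data, "[ggml:%d] %s", (int) level, text);
//     }
//     // ggml_log_set(my_log, stderr);
//
// ggml_log_callback_default() above is what runs when no callback has been installed.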
#ifdef GGML_USE_ACCELERATE
// uncomment to use vDSP for soft max computation
// note: not sure if it is actually faster
//#define GGML_SOFT_MAX_ACCELERATE
#endif

void * ggml_aligned_malloc(size_t size) {
    const int alignment = 64;

#if defined(_MSC_VER) || defined(__MINGW32__)
    return _aligned_malloc(size, alignment);
#else
    if (size == 0) {
        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
        return NULL;
    }
    void * aligned_memory = NULL;
#ifdef GGML_USE_CPU_HBM
    int result = hbw_posix_memalign(&aligned_memory, alignment, size);
#elif TARGET_OS_OSX
    GGML_UNUSED(alignment);
    kern_return_t alloc_status = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t *) &aligned_memory, size, VM_FLAGS_ANYWHERE);
    int result = EFAULT;
    switch (alloc_status) {
        case KERN_SUCCESS:
            result = 0;
            break;
        case KERN_INVALID_ADDRESS:
            result = EINVAL;
            break;
        case KERN_NO_SPACE:
            result = ENOMEM;
            break;
        default:
            result = EFAULT;
            break;
    }
#else
    int result = posix_memalign(&aligned_memory, alignment, size);
#endif
    if (result != 0) {
        // Handle allocation failure
        const char *error_desc = "unknown allocation error";
        switch (result) {
            case EINVAL:
                error_desc = "invalid alignment value";
                break;
            case ENOMEM:
                error_desc = "insufficient memory";
                break;
        }
        GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
        return NULL;
    }
    return aligned_memory;
#endif
}

void ggml_aligned_free(void * ptr, size_t size) {
    GGML_UNUSED(size);
#if defined(_MSC_VER) || defined(__MINGW32__)
    _aligned_free(ptr);
#elif GGML_USE_CPU_HBM
    if (ptr != NULL) {
        hbw_free(ptr);
    }
#elif TARGET_OS_OSX
    if (ptr != NULL) {
        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ptr, size);
    }
#else
    free(ptr);
#endif
}
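
// NOTE (editorial, illustrative only -- not part of the upstream file):
// ggml_aligned_malloc() and ggml_aligned_free() are used as a pair; the mach/HBM code
// paths need the original allocation size to release the memory, so callers keep it:
//
//     const size_t size = 16*1024*1024;             // example size
//     void * buf = ggml_aligned_malloc(size);       // 64-byte aligned, NULL on failure
//     if (buf != NULL) {
//         // ... use buf ...
//         ggml_aligned_free(buf, size);             // pass the same size back
//     }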
inline static void * ggml_malloc(size_t size) {
    if (size == 0) {
        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
        return NULL;
    }
    void * result = malloc(size);
    if (result == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
        GGML_ABORT("fatal error");
    }
    return result;
}

// calloc
inline static void * ggml_calloc(size_t num, size_t size) {
    if (num == 0 || size == 0) {
        GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
        return NULL;
    }
    void * result = calloc(num, size);
    if (result == NULL) {
        GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
        GGML_ABORT("fatal error");
    }
    return result;
}

#define GGML_MALLOC(size)      ggml_malloc(size)
#define GGML_CALLOC(num, size) ggml_calloc(num, size)
#define GGML_FREE(ptr)         free(ptr)

const char * ggml_status_to_string(enum ggml_status status) {
    switch (status) {
        case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
        case GGML_STATUS_FAILED:       return "GGML status: error (operation failed)";
        case GGML_STATUS_SUCCESS:      return "GGML status: success";
        case GGML_STATUS_ABORTED:      return "GGML status: warning (operation aborted)";
    }

    return "GGML status: unknown";
}
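
// NOTE (editorial, illustrative only -- not part of the upstream file):
// ggml_status_to_string() turns the enum ggml_status values returned by APIs such as
// graph computation into readable text, e.g.
//
//     enum ggml_status st = GGML_STATUS_ALLOC_FAILED;
//     fprintf(stderr, "%s\n", ggml_status_to_string(st));
//     // prints: GGML status: error (failed to allocate memory)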
float ggml_fp16_to_fp32(ggml_fp16_t x) {
#define ggml_fp16_to_fp32 do_not_use__ggml_fp16_to_fp32__in_ggml
    return GGML_FP16_TO_FP32(x);
}

ggml_fp16_t ggml_fp32_to_fp16(float x) {
#define ggml_fp32_to_fp16 do_not_use__ggml_fp32_to_fp16__in_ggml
    return GGML_FP32_TO_FP16(x);
}

float ggml_bf16_to_fp32(ggml_bf16_t x) {
#define ggml_bf16_to_fp32 do_not_use__ggml_bf16_to_fp32__in_ggml
    return GGML_BF16_TO_FP32(x);  // it just left shifts
}

ggml_bf16_t ggml_fp32_to_bf16(float x) {
#define ggml_fp32_to_bf16 do_not_use__ggml_fp32_to_bf16__in_ggml
    return GGML_FP32_TO_BF16(x);
}

void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) {
    for (int64_t i = 0; i < n; i++) {
        y[i] = GGML_FP16_TO_FP32(x[i]);
    }
}

// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library
//        currently, the ggml_cpu_has_* functions are entirely compile-time
void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) {
    int64_t i = 0;
#if defined(__F16C__)
    //if (ggml_cpu_has_f16c()) {
        for (; i + 7 < n; i += 8) {
            __m256 x_vec = _mm256_loadu_ps(x + i);
            __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
            _mm_storeu_si128((__m128i *)(y + i), y_vec);
        }
        for(; i + 3 < n; i += 4) {
            __m128 x_vec = _mm_loadu_ps(x + i);
            __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
            _mm_storel_epi64((__m128i *)(y + i), y_vec);
        }
    //}
#endif
    for (; i < n; i++) {
        y[i] = GGML_FP32_TO_FP16(x[i]);
    }
}

void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) {
    int64_t i = 0;
#if defined(__AVX512F__)
    //if (ggml_cpu_has_avx512()) {
        for (; i + 16 <= n; i += 16) {
            _mm512_storeu_ps(y + i,
                             _mm512_castsi512_ps(
                                 _mm512_slli_epi32(
                                     _mm512_cvtepu16_epi32(
                                         _mm256_loadu_si256(
                                             (const __m256i *)(x + i))),
                                     16)));
        }
    //}
#endif
#if defined(__AVX2__)
    //if (ggml_cpu_has_avx2()) {
        for (; i + 8 <= n; i += 8) {
            _mm256_storeu_ps(y + i,
                             _mm256_castsi256_ps(
                                 _mm256_slli_epi32(
                                     _mm256_cvtepu16_epi32(
                                         _mm_loadu_si128(
                                             (const __m128i *)(x + i))),
                                     16)));
        }
    //}
#endif
    for (; i < n; i++) {
        y[i] = GGML_BF16_TO_FP32(x[i]);
    }
}

void ggml_fp32_to_bf16_row_ref(const float * x, ggml_bf16_t * y, int64_t n) {
    for (int i = 0; i < n; i++) {
        y[i] = ggml_compute_fp32_to_bf16(x[i]);
    }
}

void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) {
    int i = 0;
#if defined(__AVX512BF16__)
    // subnormals are flushed to zero on this platform
    for (; i + 32 <= n; i += 32) {
        _mm512_storeu_si512(
            (__m512i *)(y + i),
            m512i(_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16),
                                      _mm512_loadu_ps(x + i))));
    }
#endif
    for (; i < n; i++) {
        y[i] = GGML_FP32_TO_BF16(x[i]);
    }
}
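
// NOTE (editorial, illustrative only -- not part of the upstream file):
// the *_row helpers above convert a contiguous row of n values between f32 and the
// f16/bf16 storage types, taking the SIMD fast path when the matching instruction set
// was enabled at compile time and falling back to the scalar loop otherwise, e.g.
//
//     float       src[8] = {0.0f, 0.5f, 1.0f, 2.0f, 4.0f, 8.0f, 16.0f, 32.0f};
//     ggml_fp16_t f16[8];
//     float       dst[8];
//     ggml_fp32_to_fp16_row(src, f16, 8);  // f32 -> f16 (lossy)
//     ggml_fp16_to_fp32_row(f16, dst, 8);  // f16 -> f32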
bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) {
    return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0;
}

//
// timing
//

#if defined(_MSC_VER) || defined(__MINGW32__)
static int64_t timer_freq, timer_start;
void ggml_time_init(void) {
    LARGE_INTEGER t;
    QueryPerformanceFrequency(&t);
    timer_freq = t.QuadPart;

    // The multiplication by 1000 or 1000000 below can cause an overflow if timer_freq
    // and the uptime is high enough.
    // We subtract the program start time to reduce the likelihood of that happening.
    QueryPerformanceCounter(&t);
    timer_start = t.QuadPart;
}
int64_t ggml_time_ms(void) {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return ((t.QuadPart-timer_start) * 1000) / timer_freq;
}
int64_t ggml_time_us(void) {
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return ((t.QuadPart-timer_start) * 1000000) / timer_freq;
}
#else
void ggml_time_init(void) {}
int64_t ggml_time_ms(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
}

int64_t ggml_time_us(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
}
#endif

int64_t ggml_cycles(void) {
    return clock();
}

int64_t ggml_cycles_per_ms(void) {
    return CLOCKS_PER_SEC/1000;
}
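
// NOTE (editorial, illustrative only -- not part of the upstream file):
// the usual timing pattern is to call ggml_time_init() once at startup (ggml_init()
// does this) and then take microsecond deltas:
//
//     ggml_time_init();                    // no-op on POSIX, sets the QPC baseline on Windows
//     const int64_t t0 = ggml_time_us();
//     // ... work ...
//     const int64_t t1 = ggml_time_us();
//     fprintf(stderr, "elapsed: %.3f ms\n", (t1 - t0) / 1000.0);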
//
// cross-platform UTF-8 file paths
//

#ifdef _WIN32
static wchar_t * ggml_mbstowcs(const char * mbs) {
    int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0);
    if (!wlen) {
        errno = EINVAL;
        return NULL;
    }

    wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t));
    wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen);
    if (!wlen) {
        GGML_FREE(wbuf);
        errno = EINVAL;
        return NULL;
    }

    return wbuf;
}
#endif

FILE * ggml_fopen(const char * fname, const char * mode) {
#ifdef _WIN32
    FILE * file = NULL;

    // convert fname (UTF-8)
    wchar_t * wfname = ggml_mbstowcs(fname);
    if (wfname) {
        // convert mode (ANSI)
        wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t));
        wchar_t * wmode_p = wmode;
        do {
            *wmode_p++ = (wchar_t)*mode;
        } while (*mode++);

        // open file
        file = _wfopen(wfname, wmode);

        GGML_FREE(wfname);
        GGML_FREE(wmode);
    }

    return file;
#else
    return fopen(fname, mode);
#endif
}
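
// NOTE (editorial, illustrative only -- not part of the upstream file):
// ggml_fopen() behaves like fopen() but accepts UTF-8 paths on Windows by converting to
// wide characters and calling _wfopen(), so callers can use a single code path on every
// platform; e.g. (hypothetical path):
//
//     FILE * f = ggml_fopen("models/model.gguf", "rb");
//     if (f == NULL) {
//         GGML_LOG_ERROR("%s: failed to open file\n", __func__);
//     } else {
//         fclose(f);
//     }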
static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);

static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
    [GGML_TYPE_I8] = {
        .type_name = "i8",
        .blck_size = 1,
        .type_size = sizeof(int8_t),
        .is_quantized = false,
    },
    [GGML_TYPE_I16] = {
        .type_name = "i16",
        .blck_size = 1,
        .type_size = sizeof(int16_t),
        .is_quantized = false,
    },
    [GGML_TYPE_I32] = {
        .type_name = "i32",
        .blck_size = 1,
        .type_size = sizeof(int32_t),
        .is_quantized = false,
    },
    [GGML_TYPE_I64] = {
        .type_name = "i64",
        .blck_size = 1,
        .type_size = sizeof(int64_t),
        .is_quantized = false,
    },
    [GGML_TYPE_F64] = {
        .type_name = "f64",
        .blck_size = 1,
        .type_size = sizeof(double),
        .is_quantized = false,
    },
    [GGML_TYPE_F32] = {
        .type_name = "f32",
        .blck_size = 1,
        .type_size = sizeof(float),
        .is_quantized = false,
    },
    [GGML_TYPE_F16] = {
        .type_name = "f16",
        .blck_size = 1,
        .type_size = sizeof(ggml_fp16_t),
        .is_quantized = false,
        .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
        .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
    },
    [GGML_TYPE_Q4_0] = {
        .type_name = "q4_0",
        .blck_size = QK4_0,
        .type_size = sizeof(block_q4_0),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_q4_0,
        .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref,
    },
    [GGML_TYPE_Q4_1] = {
        .type_name = "q4_1",
        .blck_size = QK4_1,
        .type_size = sizeof(block_q4_1),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_q4_1,
        .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
    },
    [4] = { // GGML_TYPE_Q4_2
        .type_name = "DEPRECATED",
        .blck_size = 0,
        .type_size = 0,
        .is_quantized = false,
    },
    [5] = { // GGML_TYPE_Q4_3
        .type_name = "DEPRECATED",
        .blck_size = 0,
        .type_size = 0,
        .is_quantized = false,
    },
    [GGML_TYPE_Q5_0] = {
        .type_name = "q5_0",
        .blck_size = QK5_0,
        .type_size = sizeof(block_q5_0),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_q5_0,
        .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref,
    },
    [GGML_TYPE_Q5_1] = {
        .type_name = "q5_1",
        .blck_size = QK5_1,
        .type_size = sizeof(block_q5_1),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_q5_1,
        .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref,
    },
    [GGML_TYPE_Q8_0] = {
        .type_name = "q8_0",
        .blck_size = QK8_0,
        .type_size = sizeof(block_q8_0),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_q8_0,
        .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref,
    },
    [GGML_TYPE_Q8_1] = {
        .type_name = "q8_1",
        .blck_size = QK8_1,
        .type_size = sizeof(block_q8_1),
        .is_quantized = true,
        .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
    },
    [GGML_TYPE_Q2_K] = {
        .type_name = "q2_K",
        .blck_size = QK_K,
        .type_size = sizeof(block_q2_K),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_q2_K,
        .from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref,
    },
    [GGML_TYPE_Q3_K] = {
        .type_name = "q3_K",
        .blck_size = QK_K,
        .type_size = sizeof(block_q3_K),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_q3_K,
        .from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref,
    },
    [GGML_TYPE_Q4_K] = {
        .type_name = "q4_K",
        .blck_size = QK_K,
        .type_size = sizeof(block_q4_K),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_q4_K,
        .from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref,
    },
    [GGML_TYPE_Q5_K] = {
        .type_name = "q5_K",
        .blck_size = QK_K,
        .type_size = sizeof(block_q5_K),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_q5_K,
        .from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref,
    },
    [GGML_TYPE_Q6_K] = {
        .type_name = "q6_K",
        .blck_size = QK_K,
        .type_size = sizeof(block_q6_K),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_q6_K,
        .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref,
    },
    [GGML_TYPE_IQ2_XXS] = {
        .type_name = "iq2_xxs",
        .blck_size = QK_K,
        .type_size = sizeof(block_iq2_xxs),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
        .from_float_ref = NULL,
    },
    [GGML_TYPE_IQ2_XS] = {
        .type_name = "iq2_xs",
        .blck_size = QK_K,
        .type_size = sizeof(block_iq2_xs),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
        .from_float_ref = NULL,
    },
    [GGML_TYPE_IQ3_XXS] = {
        .type_name = "iq3_xxs",
        .blck_size = QK_K,
        .type_size = sizeof(block_iq3_xxs),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
        .from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
    },
    [GGML_TYPE_IQ3_S] = {
        .type_name = "iq3_s",
        .blck_size = QK_K,
        .type_size = sizeof(block_iq3_s),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_iq3_s,
        .from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref,
    },
    [GGML_TYPE_IQ2_S] = {
        .type_name = "iq2_s",
        .blck_size = QK_K,
        .type_size = sizeof(block_iq2_s),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_iq2_s,
        .from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref,
    },
    [GGML_TYPE_IQ1_S] = {
        .type_name = "iq1_s",
        .blck_size = QK_K,
        .type_size = sizeof(block_iq1_s),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_iq1_s,
        .from_float_ref = NULL,
    },
    [GGML_TYPE_IQ1_M] = {
        .type_name = "iq1_m",
        .blck_size = QK_K,
        .type_size = sizeof(block_iq1_m),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_iq1_m,
        .from_float_ref = NULL,
    },
    [GGML_TYPE_IQ4_NL] = {
        .type_name = "iq4_nl",
        .blck_size = QK4_NL,
        .type_size = sizeof(block_iq4_nl),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
        .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
    },
    [GGML_TYPE_IQ4_XS] = {
        .type_name = "iq4_xs",
        .blck_size = QK_K,
        .type_size = sizeof(block_iq4_xs),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
        .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref,
    },
    [GGML_TYPE_Q8_K] = {
        .type_name = "q8_K",
        .blck_size = QK_K,
        .type_size = sizeof(block_q8_K),
        .is_quantized = true,
    },
    [GGML_TYPE_BF16] = {
        .type_name = "bf16",
        .blck_size = 1,
        .type_size = sizeof(ggml_bf16_t),
        .is_quantized = false,
        .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
        .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
    },
    [31] = { // GGML_TYPE_Q4_0_4_4
        .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
        .blck_size = 0,
        .type_size = 0,
        .is_quantized = false,
    },
    [32] = { // GGML_TYPE_Q4_0_4_8
        .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
        .blck_size = 0,
        .type_size = 0,
        .is_quantized = false,
    },
    [33] = { // GGML_TYPE_Q4_0_8_8
        .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
        .blck_size = 0,
        .type_size = 0,
        .is_quantized = false,
    },
    [GGML_TYPE_TQ1_0] = {
        .type_name = "tq1_0",
        .blck_size = QK_K,
        .type_size = sizeof(block_tq1_0),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_tq1_0,
        .from_float_ref = (ggml_from_float_t) quantize_row_tq1_0_ref,
    },
    [GGML_TYPE_TQ2_0] = {
        .type_name = "tq2_0",
        .blck_size = QK_K,
        .type_size = sizeof(block_tq2_0),
        .is_quantized = true,
        .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
        .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
    },
    [36] = { // GGML_TYPE_IQ4_NL_4_4
        .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
        .blck_size = 0,
        .type_size = 0,
        .is_quantized = false,
    },
    [37] = { // GGML_TYPE_IQ4_NL_4_8
        .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
        .blck_size = 0,
        .type_size = 0,
        .is_quantized = false,
    },
    [38] = { // GGML_TYPE_IQ4_NL_8_8
        .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
        .blck_size = 0,
        .type_size = 0,
        .is_quantized = false,
    },
};

const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
    GGML_ASSERT(type < GGML_TYPE_COUNT);
    return &type_traits[type];
}
  805. //
  806. // ggml object
  807. //
  808. struct ggml_object {
  809. size_t offs;
  810. size_t size;
  811. struct ggml_object * next;
  812. enum ggml_object_type type;
  813. char padding[4];
  814. };
  815. static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
  816. //
  817. // ggml context
  818. //
  819. struct ggml_context {
  820. size_t mem_size;
  821. void * mem_buffer;
  822. bool mem_buffer_owned;
  823. bool no_alloc;
  824. int n_objects;
  825. struct ggml_object * objects_begin;
  826. struct ggml_object * objects_end;
  827. };
  828. struct ggml_context_container {
  829. bool used;
  830. struct ggml_context context;
  831. };
  832. //
  833. // data types
  834. //
  835. static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
  836. "NONE",
  837. "DUP",
  838. "ADD",
  839. "ADD1",
  840. "ACC",
  841. "SUB",
  842. "MUL",
  843. "DIV",
  844. "SQR",
  845. "SQRT",
  846. "LOG",
  847. "SIN",
  848. "COS",
  849. "SUM",
  850. "SUM_ROWS",
  851. "MEAN",
  852. "ARGMAX",
  853. "COUNT_EQUAL",
  854. "REPEAT",
  855. "REPEAT_BACK",
  856. "CONCAT",
  857. "SILU_BACK",
  858. "NORM",
  859. "RMS_NORM",
  860. "RMS_NORM_BACK",
  861. "GROUP_NORM",
  862. "MUL_MAT",
  863. "MUL_MAT_ID",
  864. "OUT_PROD",
  865. "SCALE",
  866. "SET",
  867. "CPY",
  868. "CONT",
  869. "RESHAPE",
  870. "VIEW",
  871. "PERMUTE",
  872. "TRANSPOSE",
  873. "GET_ROWS",
  874. "GET_ROWS_BACK",
  875. "DIAG",
  876. "DIAG_MASK_INF",
  877. "DIAG_MASK_ZERO",
  878. "SOFT_MAX",
  879. "SOFT_MAX_BACK",
  880. "ROPE",
  881. "ROPE_BACK",
  882. "CLAMP",
  883. "CONV_TRANSPOSE_1D",
  884. "IM2COL",
  885. "IM2COL_BACK",
  886. "CONV_TRANSPOSE_2D",
  887. "POOL_1D",
  888. "POOL_2D",
  889. "POOL_2D_BACK",
  890. "UPSCALE",
  891. "PAD",
  892. "PAD_REFLECT_1D",
  893. "UNPAD",
  894. "ARANGE",
  895. "TIMESTEP_EMBEDDING",
  896. "ARGSORT",
  897. "LEAKY_RELU",
  898. "FLASH_ATTN_EXT",
  899. "FLASH_ATTN_BACK",
  900. "SSM_CONV",
  901. "SSM_SCAN",
  902. "WIN_PART",
  903. "WIN_UNPART",
  904. "GET_REL_POS",
  905. "ADD_REL_POS",
  906. "RWKV_WKV6",
  907. "UNARY",
  908. "MAP_UNARY",
  909. "MAP_BINARY",
  910. "MAP_CUSTOM1_F32",
  911. "MAP_CUSTOM2_F32",
  912. "MAP_CUSTOM3_F32",
  913. "MAP_CUSTOM1",
  914. "MAP_CUSTOM2",
  915. "MAP_CUSTOM3",
  916. "CROSS_ENTROPY_LOSS",
  917. "CROSS_ENTROPY_LOSS_BACK",
  918. "OPT_STEP_ADAMW",
  919. };
  920. static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
  921. static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
  922. "none",
  923. "x",
  924. "x+y",
  925. "x+y",
  926. "view(x,nb,offset)+=y->x",
  927. "x-y",
  928. "x*y",
  929. "x/y",
  930. "x^2",
  931. "√x",
  932. "log(x)",
  933. "sin(x)",
  934. "cos(x)",
  935. "Σx",
  936. "Σx_k",
  937. "Σx/n",
  938. "argmax(x)",
  939. "count_equal(x)",
  940. "repeat(x)",
  941. "repeat_back(x)",
  942. "concat(x, y)",
  943. "silu_back(x)",
  944. "norm(x)",
  945. "rms_norm(x)",
  946. "rms_norm_back(x)",
  947. "group_norm(x)",
  948. "X*Y",
  949. "X[i]*Y",
  950. "X*Y",
  951. "x*v",
  952. "y-\\>view(x)",
  953. "x-\\>y",
  954. "cont(x)",
  955. "reshape(x)",
  956. "view(x)",
  957. "permute(x)",
  958. "transpose(x)",
  959. "get_rows(x)",
  960. "get_rows_back(x)",
  961. "diag(x)",
  962. "diag_mask_inf(x)",
  963. "diag_mask_zero(x)",
  964. "soft_max(x)",
  965. "soft_max_back(x)",
  966. "rope(x)",
  967. "rope_back(x)",
  968. "clamp(x)",
  969. "conv_transpose_1d(x)",
  970. "im2col(x)",
  971. "im2col_back(x)",
  972. "conv_transpose_2d(x)",
  973. "pool_1d(x)",
  974. "pool_2d(x)",
  975. "pool_2d_back(x)",
  976. "upscale(x)",
  977. "pad(x)",
  978. "pad_reflect_1d(x)",
  979. "unpad(x)",
  980. "arange(start, stop, step)",
  981. "timestep_embedding(timesteps, dim, max_period)",
  982. "argsort(x)",
  983. "leaky_relu(x)",
  984. "flash_attn_ext(x)",
  985. "flash_attn_back(x)",
  986. "ssm_conv(x)",
  987. "ssm_scan(x)",
  988. "win_part(x)",
  989. "win_unpart(x)",
  990. "get_rel_pos(x)",
  991. "add_rel_pos(x)",
  992. "rwkv_wkv6(k, v, r, tf, td, s)",
  993. "unary(x)",
  994. "f(x)",
  995. "f(x,y)",
  996. "custom_f32(x)",
  997. "custom_f32(x,y)",
  998. "custom_f32(x,y,z)",
  999. "custom(x)",
  1000. "custom(x,y)",
  1001. "custom(x,y,z)",
  1002. "cross_entropy_loss(x,y)",
  1003. "cross_entropy_loss_back(x,y)",
  1004. "adamw(x)",
  1005. };
  1006. static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83");
  1007. static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
  1008. static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
  1009. "ABS",
  1010. "SGN",
  1011. "NEG",
  1012. "STEP",
  1013. "TANH",
  1014. "ELU",
  1015. "RELU",
  1016. "SIGMOID",
  1017. "GELU",
  1018. "GELU_QUICK",
  1019. "SILU",
  1020. "HARDSWISH",
  1021. "HARDSIGMOID",
  1022. "EXP",
  1023. };
  1024. static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14");
  1025. static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
  1026. static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
  1027. ////////////////////////////////////////////////////////////////////////////////
  1028. void ggml_print_object(const struct ggml_object * obj) {
  1029. GGML_LOG_INFO(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
  1030. obj->type, obj->offs, obj->size, (const void *) obj->next);
  1031. }
  1032. void ggml_print_objects(const struct ggml_context * ctx) {
  1033. struct ggml_object * obj = ctx->objects_begin;
  1034. GGML_LOG_INFO("%s: objects in context %p:\n", __func__, (const void *) ctx);
  1035. while (obj != NULL) {
  1036. ggml_print_object(obj);
  1037. obj = obj->next;
  1038. }
  1039. GGML_LOG_INFO("%s: --- end ---\n", __func__);
  1040. }
  1041. int64_t ggml_nelements(const struct ggml_tensor * tensor) {
  1042. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1043. return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
  1044. }
  1045. int64_t ggml_nrows(const struct ggml_tensor * tensor) {
  1046. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1047. return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
  1048. }
  1049. size_t ggml_nbytes(const struct ggml_tensor * tensor) {
  1050. size_t nbytes;
  1051. const size_t blck_size = ggml_blck_size(tensor->type);
  1052. if (blck_size == 1) {
  1053. nbytes = ggml_type_size(tensor->type);
  1054. for (int i = 0; i < GGML_MAX_DIMS; ++i) {
  1055. nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
  1056. }
  1057. }
  1058. else {
  1059. nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
  1060. for (int i = 1; i < GGML_MAX_DIMS; ++i) {
  1061. nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
  1062. }
  1063. }
  1064. return nbytes;
  1065. }
  1066. size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
  1067. return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
  1068. }
  1069. int64_t ggml_blck_size(enum ggml_type type) {
  1070. return type_traits[type].blck_size;
  1071. }
  1072. size_t ggml_type_size(enum ggml_type type) {
  1073. return type_traits[type].type_size;
  1074. }
  1075. size_t ggml_row_size(enum ggml_type type, int64_t ne) {
  1076. assert(ne % ggml_blck_size(type) == 0);
  1077. return ggml_type_size(type)*ne/ggml_blck_size(type);
  1078. }
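// [editor's note] Worked example, not part of the original file: the per-element cost of a type
// is type_size/blck_size bytes. For F32 (blck_size == 1) a row of ne elements takes
// ne*sizeof(float) bytes; for a block-quantized type such as Q4_K (blck_size == QK_K) a row of
// 4096 elements takes sizeof(block_q4_K)*4096/QK_K bytes - exactly what ggml_row_size() returns.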
  1079. double ggml_type_sizef(enum ggml_type type) {
  1080. return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
  1081. }
  1082. const char * ggml_type_name(enum ggml_type type) {
  1083. return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
  1084. }
  1085. bool ggml_is_quantized(enum ggml_type type) {
  1086. return type_traits[type].is_quantized;
  1087. }
  1088. const char * ggml_op_name(enum ggml_op op) {
  1089. return GGML_OP_NAME[op];
  1090. }
  1091. const char * ggml_op_symbol(enum ggml_op op) {
  1092. return GGML_OP_SYMBOL[op];
  1093. }
  1094. const char * ggml_unary_op_name(enum ggml_unary_op op) {
  1095. return GGML_UNARY_OP_NAME[op];
  1096. }
  1097. const char * ggml_op_desc(const struct ggml_tensor * t) {
  1098. if (t->op == GGML_OP_UNARY) {
  1099. enum ggml_unary_op uop = ggml_get_unary_op(t);
  1100. return ggml_unary_op_name(uop);
  1101. }
  1102. return ggml_op_name(t->op);
  1103. }
  1104. size_t ggml_element_size(const struct ggml_tensor * tensor) {
  1105. return ggml_type_size(tensor->type);
  1106. }
  1107. bool ggml_is_scalar(const struct ggml_tensor * tensor) {
  1108. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1109. return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
  1110. }
  1111. bool ggml_is_vector(const struct ggml_tensor * tensor) {
  1112. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1113. return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
  1114. }
  1115. bool ggml_is_matrix(const struct ggml_tensor * tensor) {
  1116. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1117. return tensor->ne[2] == 1 && tensor->ne[3] == 1;
  1118. }
  1119. bool ggml_is_3d(const struct ggml_tensor * tensor) {
  1120. return tensor->ne[3] == 1;
  1121. }
  1122. int ggml_n_dims(const struct ggml_tensor * tensor) {
  1123. for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) {
  1124. if (tensor->ne[i] > 1) {
  1125. return i + 1;
  1126. }
  1127. }
  1128. return 1;
  1129. }
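// [editor's note] Example, not part of the original file: ggml_n_dims() returns one plus the
// index of the highest non-trivial dimension, so ne = {5, 1, 3, 1} gives 3 (the size-1 dimension
// in the middle still counts) and a scalar with ne = {1, 1, 1, 1} gives 1.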
  1130. enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
  1131. enum ggml_type wtype = GGML_TYPE_COUNT;
  1132. switch (ftype) {
  1133. case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
  1134. case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
  1135. case GGML_FTYPE_MOSTLY_BF16: wtype = GGML_TYPE_BF16; break;
  1136. case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
  1137. case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
  1138. case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
  1139. case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
  1140. case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
  1141. case GGML_FTYPE_MOSTLY_Q2_K: wtype = GGML_TYPE_Q2_K; break;
  1142. case GGML_FTYPE_MOSTLY_Q3_K: wtype = GGML_TYPE_Q3_K; break;
  1143. case GGML_FTYPE_MOSTLY_Q4_K: wtype = GGML_TYPE_Q4_K; break;
  1144. case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break;
  1145. case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break;
  1146. case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break;
  1147. case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break;
  1148. case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break;
  1149. case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break;
  1150. case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break;
  1151. case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break;
  1152. case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
  1153. case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
  1154. case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
  1155. case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
  1156. case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
  1157. }
  1158. GGML_ASSERT(wtype != GGML_TYPE_COUNT);
  1159. return wtype;
  1160. }
  1161. size_t ggml_tensor_overhead(void) {
  1162. return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
  1163. }
  1164. bool ggml_is_transposed(const struct ggml_tensor * tensor) {
  1165. return tensor->nb[0] > tensor->nb[1];
  1166. }
  1167. static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
  1168. size_t next_nb = ggml_type_size(tensor->type);
  1169. if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
  1170. return false;
  1171. }
  1172. next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
  1173. for (int i = 1; i < GGML_MAX_DIMS; i++) {
  1174. if (tensor->ne[i] != 1) {
  1175. if (i > n) {
  1176. if (tensor->nb[i] != next_nb) {
  1177. return false;
  1178. }
  1179. next_nb *= tensor->ne[i];
  1180. } else {
  1181. // this dimension does not need to be contiguous
  1182. next_nb = tensor->ne[i]*tensor->nb[i];
  1183. }
  1184. }
  1185. }
  1186. return true;
  1187. }
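// [editor's note] Clarifying example, not part of the original file: ggml_is_contiguous_n(t, n)
// allows dimensions 1..n to have arbitrary strides and only requires the dimensions above n to be
// densely packed on top of them. For instance, a view that keeps every other row of a matrix
// (nb[1] doubled, ne[1] halved) fails ggml_is_contiguous_0() but passes ggml_is_contiguous_1(),
// because each row is still contiguous and nb[2], nb[3] still follow from ne[1]*nb[1].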
  1188. bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
  1189. return ggml_is_contiguous_0(tensor);
  1190. }
  1191. bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
  1192. return ggml_is_contiguous_n(tensor, 0);
  1193. }
  1194. bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
  1195. return ggml_is_contiguous_n(tensor, 1);
  1196. }
  1197. bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
  1198. return ggml_is_contiguous_n(tensor, 2);
  1199. }
  1200. bool ggml_is_permuted(const struct ggml_tensor * tensor) {
  1201. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1202. return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
  1203. }
  1204. static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
  1205. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1206. return
  1207. tensor->nb[0] == ggml_type_size(tensor->type) &&
  1208. tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
  1209. tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
  1210. }
  1211. bool ggml_is_empty(const struct ggml_tensor * tensor) {
  1212. for (int i = 0; i < GGML_MAX_DIMS; ++i) {
  1213. if (tensor->ne[i] == 0) {
  1214. // empty if any dimension has no elements
  1215. return true;
  1216. }
  1217. }
  1218. return false;
  1219. }
  1220. bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  1221. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1222. return
  1223. (t0->ne[0] == t1->ne[0]) &&
  1224. (t0->ne[1] == t1->ne[1]) &&
  1225. (t0->ne[2] == t1->ne[2]) &&
  1226. (t0->ne[3] == t1->ne[3]);
  1227. }
  1228. bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  1229. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1230. return
  1231. (t0->nb[0] == t1->nb[0]) &&
  1232. (t0->nb[1] == t1->nb[1]) &&
  1233. (t0->nb[2] == t1->nb[2]) &&
  1234. (t0->nb[3] == t1->nb[3]);
  1235. }
1236. // check if t1 can be represented as a repetition of t0
  1237. bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  1238. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1239. return ggml_is_empty(t0) ? ggml_is_empty(t1) :
  1240. (t1->ne[0]%t0->ne[0] == 0) &&
  1241. (t1->ne[1]%t0->ne[1] == 0) &&
  1242. (t1->ne[2]%t0->ne[2] == 0) &&
  1243. (t1->ne[3]%t0->ne[3] == 0);
  1244. }
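// [editor's note] Example, not part of the original file: this is the broadcasting rule used by
// ggml_add(), ggml_mul() and friends (they assert ggml_can_repeat(b, a)). A bias with
// ne = {4096, 1, 1, 1} can be repeated over activations with ne = {4096, 32, 8, 1} because every
// dimension of t1 is a whole multiple of the corresponding dimension of t0.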
  1245. static inline bool ggml_can_repeat_rows(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  1246. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  1247. return (t0->ne[0] == t1->ne[0]) && ggml_can_repeat(t0, t1);
  1248. }
  1249. // assert that pointer is aligned to GGML_MEM_ALIGN
  1250. #define GGML_ASSERT_ALIGNED(ptr) \
  1251. GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
  1252. ////////////////////////////////////////////////////////////////////////////////
  1253. struct ggml_context * ggml_init(struct ggml_init_params params) {
  1254. static bool is_first_call = true;
  1255. ggml_critical_section_start();
  1256. if (is_first_call) {
  1257. // initialize time system (required on Windows)
  1258. ggml_time_init();
  1259. for (int i = 0; i < (1 << 16); ++i) {
  1260. union {
  1261. uint16_t u16;
  1262. ggml_fp16_t fp16;
  1263. } u = {i};
  1264. ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16);
  1265. }
  1266. is_first_call = false;
  1267. }
  1268. ggml_critical_section_end();
  1269. struct ggml_context * ctx = GGML_MALLOC(sizeof(struct ggml_context));
1270. // allow calling ggml_init with 0 size
  1271. if (params.mem_size == 0) {
  1272. params.mem_size = GGML_MEM_ALIGN;
  1273. }
  1274. const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
  1275. *ctx = (struct ggml_context) {
  1276. /*.mem_size =*/ mem_size,
  1277. /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
  1278. /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
  1279. /*.no_alloc =*/ params.no_alloc,
  1280. /*.n_objects =*/ 0,
  1281. /*.objects_begin =*/ NULL,
  1282. /*.objects_end =*/ NULL,
  1283. };
  1284. GGML_ASSERT(ctx->mem_buffer != NULL);
  1285. GGML_ASSERT_ALIGNED(ctx->mem_buffer);
  1286. GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
  1287. return ctx;
  1288. }
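// [editor's sketch] A minimal usage example, not part of the original file; it assumes the
// declarations from ggml.h that this file already includes. The 16 MB budget and the function
// name are illustrative only.
static void ggml_example_context_lifecycle(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024, // size of the context's memory pool in bytes
        /*.mem_buffer =*/ NULL,         // NULL -> the pool is allocated and owned by the context
        /*.no_alloc   =*/ false,        // tensor data is allocated from the pool
    };
    struct ggml_context * ctx = ggml_init(params);
    // ... create tensors and build graphs with ctx ...
    ggml_free(ctx); // frees the pool because mem_buffer_owned == true
}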
  1289. void ggml_reset(struct ggml_context * ctx) {
  1290. if (ctx == NULL) {
  1291. return;
  1292. }
  1293. ctx->n_objects = 0;
  1294. ctx->objects_begin = NULL;
  1295. ctx->objects_end = NULL;
  1296. }
  1297. void ggml_free(struct ggml_context * ctx) {
  1298. if (ctx == NULL) {
  1299. return;
  1300. }
  1301. if (ctx->mem_buffer_owned) {
  1302. ggml_aligned_free(ctx->mem_buffer, ctx->mem_size);
  1303. }
  1304. GGML_FREE(ctx);
  1305. }
  1306. size_t ggml_used_mem(const struct ggml_context * ctx) {
  1307. return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
  1308. }
  1309. bool ggml_get_no_alloc(struct ggml_context * ctx) {
  1310. return ctx->no_alloc;
  1311. }
  1312. void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
  1313. ctx->no_alloc = no_alloc;
  1314. }
  1315. void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
  1316. return ctx->mem_buffer;
  1317. }
  1318. size_t ggml_get_mem_size(const struct ggml_context * ctx) {
  1319. return ctx->mem_size;
  1320. }
  1321. size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
  1322. size_t max_size = 0;
  1323. for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
  1324. size_t bytes = ggml_nbytes(tensor);
  1325. max_size = MAX(max_size, bytes);
  1326. }
  1327. return max_size;
  1328. }
  1329. ////////////////////////////////////////////////////////////////////////////////
  1330. static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
  1331. // always insert objects at the end of the context's memory pool
  1332. struct ggml_object * obj_cur = ctx->objects_end;
  1333. const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
  1334. const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
  1335. const size_t cur_end = cur_offs + cur_size;
  1336. // align to GGML_MEM_ALIGN
  1337. size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
  1338. char * const mem_buffer = ctx->mem_buffer;
  1339. struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
  1340. if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
  1341. GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
  1342. __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
  1343. #ifndef NDEBUG
  1344. GGML_ABORT("not enough space in the context's memory pool");
  1345. #endif
  1346. return NULL;
  1347. }
  1348. *obj_new = (struct ggml_object) {
  1349. .offs = cur_end + GGML_OBJECT_SIZE,
  1350. .size = size_needed,
  1351. .next = NULL,
  1352. .type = type,
  1353. };
  1354. GGML_ASSERT_ALIGNED(mem_buffer + obj_new->offs);
  1355. if (obj_cur != NULL) {
  1356. obj_cur->next = obj_new;
  1357. } else {
  1358. // this is the first object in this context
  1359. ctx->objects_begin = obj_new;
  1360. }
  1361. ctx->objects_end = obj_new;
  1362. //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
  1363. return obj_new;
  1364. }
  1365. static struct ggml_tensor * ggml_new_tensor_impl(
  1366. struct ggml_context * ctx,
  1367. enum ggml_type type,
  1368. int n_dims,
  1369. const int64_t * ne,
  1370. struct ggml_tensor * view_src,
  1371. size_t view_offs) {
  1372. GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
  1373. GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
  1374. // find the base tensor and absolute offset
  1375. if (view_src != NULL && view_src->view_src != NULL) {
  1376. view_offs += view_src->view_offs;
  1377. view_src = view_src->view_src;
  1378. }
  1379. size_t data_size = ggml_row_size(type, ne[0]);
  1380. for (int i = 1; i < n_dims; i++) {
  1381. data_size *= ne[i];
  1382. }
  1383. GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src));
  1384. void * data = view_src != NULL ? view_src->data : NULL;
  1385. if (data != NULL) {
  1386. data = (char *) data + view_offs;
  1387. }
  1388. size_t obj_alloc_size = 0;
  1389. if (view_src == NULL && !ctx->no_alloc) {
  1390. // allocate tensor data in the context's memory pool
  1391. obj_alloc_size = data_size;
  1392. }
  1393. struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
  1394. GGML_ASSERT(obj_new);
  1395. struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
  1396. #ifdef __clang__
  1397. // temporary until ggml_tensor::backend is removed
  1398. #pragma clang diagnostic push
  1399. #pragma clang diagnostic ignored "-Wdeprecated-declarations"
  1400. #endif
  1401. *result = (struct ggml_tensor) {
  1402. /*.type =*/ type,
  1403. /*.backend =*/ GGML_BACKEND_TYPE_CPU,
  1404. /*.buffer =*/ NULL,
  1405. /*.ne =*/ { 1, 1, 1, 1 },
  1406. /*.nb =*/ { 0, 0, 0, 0 },
  1407. /*.op =*/ GGML_OP_NONE,
  1408. /*.op_params =*/ { 0 },
  1409. /*.flags =*/ 0,
  1410. /*.src =*/ { NULL },
  1411. /*.view_src =*/ view_src,
  1412. /*.view_offs =*/ view_offs,
  1413. /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
  1414. /*.name =*/ { 0 },
  1415. /*.extra =*/ NULL,
  1416. /*.padding =*/ { 0 },
  1417. };
  1418. #ifdef __clang__
  1419. #pragma clang diagnostic pop
  1420. #endif
  1421. // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
  1422. //GGML_ASSERT_ALIGNED(result->data);
  1423. for (int i = 0; i < n_dims; i++) {
  1424. result->ne[i] = ne[i];
  1425. }
  1426. result->nb[0] = ggml_type_size(type);
  1427. result->nb[1] = result->nb[0]*(result->ne[0]/ggml_blck_size(type));
  1428. for (int i = 2; i < GGML_MAX_DIMS; i++) {
  1429. result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
  1430. }
  1431. ctx->n_objects++;
  1432. return result;
  1433. }
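// [editor's note] Stride example, not part of the original file: for a contiguous F32 tensor with
// ne = {4096, 32, 8, 1} the code above produces nb = {4, 4*4096, 4*4096*32, 4*4096*32*8} bytes,
// i.e. nb[0] = sizeof(float) and nb[i] = nb[i-1]*ne[i-1] (blck_size == 1 for F32).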
  1434. struct ggml_tensor * ggml_new_tensor(
  1435. struct ggml_context * ctx,
  1436. enum ggml_type type,
  1437. int n_dims,
  1438. const int64_t * ne) {
  1439. return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL, 0);
  1440. }
  1441. struct ggml_tensor * ggml_new_tensor_1d(
  1442. struct ggml_context * ctx,
  1443. enum ggml_type type,
  1444. int64_t ne0) {
  1445. return ggml_new_tensor(ctx, type, 1, &ne0);
  1446. }
  1447. struct ggml_tensor * ggml_new_tensor_2d(
  1448. struct ggml_context * ctx,
  1449. enum ggml_type type,
  1450. int64_t ne0,
  1451. int64_t ne1) {
  1452. const int64_t ne[2] = { ne0, ne1 };
  1453. return ggml_new_tensor(ctx, type, 2, ne);
  1454. }
  1455. struct ggml_tensor * ggml_new_tensor_3d(
  1456. struct ggml_context * ctx,
  1457. enum ggml_type type,
  1458. int64_t ne0,
  1459. int64_t ne1,
  1460. int64_t ne2) {
  1461. const int64_t ne[3] = { ne0, ne1, ne2 };
  1462. return ggml_new_tensor(ctx, type, 3, ne);
  1463. }
  1464. struct ggml_tensor * ggml_new_tensor_4d(
  1465. struct ggml_context * ctx,
  1466. enum ggml_type type,
  1467. int64_t ne0,
  1468. int64_t ne1,
  1469. int64_t ne2,
  1470. int64_t ne3) {
  1471. const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
  1472. return ggml_new_tensor(ctx, type, 4, ne);
  1473. }
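// [editor's sketch] Illustrative only, not part of the original file: the 1d..4d constructors
// above all funnel into ggml_new_tensor_impl(). ne0 is the innermost (row) dimension, so a
// matrix with 32 rows of 4096 floats is created as shown below. Assumes a context from ggml_init().
static void ggml_example_new_tensors(struct ggml_context * ctx) {
    struct ggml_tensor * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);     // 1024 floats
    struct ggml_tensor * m = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 32); // 32 rows of 4096 floats
    struct ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 64, 64, 8, 2);
    ggml_set_name(m, "example_matrix");
    (void) v; (void) t;
}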
  1474. void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes) {
  1475. struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, nbytes);
  1476. return (uint8_t *)ctx->mem_buffer + obj->offs;
  1477. }
  1478. struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
  1479. return ggml_new_tensor(ctx, src->type, GGML_MAX_DIMS, src->ne);
  1480. }
  1481. void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3) {
  1482. const int64_t ne2 = tensor->ne[2];
  1483. const int64_t ne1 = tensor->ne[1];
  1484. const int64_t ne0 = tensor->ne[0];
  1485. const int64_t i3_ = (i/(ne2*ne1*ne0));
  1486. const int64_t i2_ = (i - i3_*ne2*ne1*ne0)/(ne1*ne0);
  1487. const int64_t i1_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0)/ne0;
  1488. const int64_t i0_ = (i - i3_*ne2*ne1*ne0 - i2_*ne1*ne0 - i1_*ne0);
  1489. if (i0) {
  1490. * i0 = i0_;
  1491. }
  1492. if (i1) {
  1493. * i1 = i1_;
  1494. }
  1495. if (i2) {
  1496. * i2 = i2_;
  1497. }
  1498. if (i3) {
  1499. * i3 = i3_;
  1500. }
  1501. }
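// [editor's note] Worked example, not part of the original file: for ne = {4, 3, 2, 1} and flat
// index i = 17, the divisions above give i3 = 17/24 = 0, i2 = 17/12 = 1, i1 = (17 - 12)/4 = 1 and
// i0 = 17 - 12 - 4 = 1, i.e. coordinates (i0, i1, i2, i3) = (1, 1, 1, 0).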
  1502. void * ggml_get_data(const struct ggml_tensor * tensor) {
  1503. return tensor->data;
  1504. }
  1505. float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
  1506. assert(tensor->type == GGML_TYPE_F32);
  1507. return (float *)(tensor->data);
  1508. }
  1509. enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
  1510. GGML_ASSERT(tensor->op == GGML_OP_UNARY);
  1511. return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
  1512. }
  1513. const char * ggml_get_name(const struct ggml_tensor * tensor) {
  1514. return tensor->name;
  1515. }
  1516. struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
  1517. size_t i;
  1518. for (i = 0; i < sizeof(tensor->name) - 1 && name[i] != '\0'; i++) {
  1519. tensor->name[i] = name[i];
  1520. }
  1521. tensor->name[i] = '\0';
  1522. return tensor;
  1523. }
  1524. struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
  1525. va_list args;
  1526. va_start(args, fmt);
  1527. vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
  1528. va_end(args);
  1529. return tensor;
  1530. }
  1531. struct ggml_tensor * ggml_view_tensor(
  1532. struct ggml_context * ctx,
  1533. struct ggml_tensor * src) {
  1534. struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, GGML_MAX_DIMS, src->ne, src, 0);
  1535. ggml_format_name(result, "%s (view)", src->name);
  1536. for (int i = 0; i < GGML_MAX_DIMS; i++) {
  1537. result->nb[i] = src->nb[i];
  1538. }
  1539. return result;
  1540. }
  1541. struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
  1542. struct ggml_object * obj = ctx->objects_begin;
  1543. char * const mem_buffer = ctx->mem_buffer;
  1544. while (obj != NULL) {
  1545. if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
  1546. return (struct ggml_tensor *)(mem_buffer + obj->offs);
  1547. }
  1548. obj = obj->next;
  1549. }
  1550. return NULL;
  1551. }
  1552. struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
  1553. struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
  1554. obj = obj->next;
  1555. char * const mem_buffer = ctx->mem_buffer;
  1556. while (obj != NULL) {
  1557. if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
  1558. return (struct ggml_tensor *)(mem_buffer + obj->offs);
  1559. }
  1560. obj = obj->next;
  1561. }
  1562. return NULL;
  1563. }
  1564. struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
  1565. struct ggml_object * obj = ctx->objects_begin;
  1566. char * const mem_buffer = ctx->mem_buffer;
  1567. while (obj != NULL) {
  1568. if (obj->type == GGML_OBJECT_TYPE_TENSOR) {
  1569. struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
  1570. if (strcmp(cur->name, name) == 0) {
  1571. return cur;
  1572. }
  1573. }
  1574. obj = obj->next;
  1575. }
  1576. return NULL;
  1577. }
  1578. ////////////////////////////////////////////////////////////////////////////////
  1579. // ggml_dup
  1580. static struct ggml_tensor * ggml_dup_impl(
  1581. struct ggml_context * ctx,
  1582. struct ggml_tensor * a,
  1583. bool inplace) {
  1584. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1585. result->op = GGML_OP_DUP;
  1586. result->src[0] = a;
  1587. return result;
  1588. }
  1589. struct ggml_tensor * ggml_dup(
  1590. struct ggml_context * ctx,
  1591. struct ggml_tensor * a) {
  1592. return ggml_dup_impl(ctx, a, false);
  1593. }
  1594. struct ggml_tensor * ggml_dup_inplace(
  1595. struct ggml_context * ctx,
  1596. struct ggml_tensor * a) {
  1597. return ggml_dup_impl(ctx, a, true);
  1598. }
  1599. // ggml_add
  1600. static struct ggml_tensor * ggml_add_impl(
  1601. struct ggml_context * ctx,
  1602. struct ggml_tensor * a,
  1603. struct ggml_tensor * b,
  1604. bool inplace) {
  1605. GGML_ASSERT(ggml_can_repeat(b, a));
  1606. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1607. result->op = GGML_OP_ADD;
  1608. result->src[0] = a;
  1609. result->src[1] = b;
  1610. return result;
  1611. }
  1612. struct ggml_tensor * ggml_add(
  1613. struct ggml_context * ctx,
  1614. struct ggml_tensor * a,
  1615. struct ggml_tensor * b) {
  1616. return ggml_add_impl(ctx, a, b, false);
  1617. }
  1618. struct ggml_tensor * ggml_add_inplace(
  1619. struct ggml_context * ctx,
  1620. struct ggml_tensor * a,
  1621. struct ggml_tensor * b) {
  1622. return ggml_add_impl(ctx, a, b, true);
  1623. }
  1624. // ggml_add_cast
  1625. static struct ggml_tensor * ggml_add_cast_impl(
  1626. struct ggml_context * ctx,
  1627. struct ggml_tensor * a,
  1628. struct ggml_tensor * b,
  1629. enum ggml_type type) {
  1630. // TODO: support less-strict constraint
  1631. // GGML_ASSERT(ggml_can_repeat(b, a));
  1632. GGML_ASSERT(ggml_can_repeat_rows(b, a));
1633. // currently only supported for quantized, f16 and bf16 input
  1634. GGML_ASSERT(ggml_is_quantized(a->type) ||
  1635. a->type == GGML_TYPE_F16 ||
  1636. a->type == GGML_TYPE_BF16);
  1637. struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
  1638. result->op = GGML_OP_ADD;
  1639. result->src[0] = a;
  1640. result->src[1] = b;
  1641. return result;
  1642. }
  1643. struct ggml_tensor * ggml_add_cast(
  1644. struct ggml_context * ctx,
  1645. struct ggml_tensor * a,
  1646. struct ggml_tensor * b,
  1647. enum ggml_type type) {
  1648. return ggml_add_cast_impl(ctx, a, b, type);
  1649. }
  1650. // ggml_add1
  1651. static struct ggml_tensor * ggml_add1_impl(
  1652. struct ggml_context * ctx,
  1653. struct ggml_tensor * a,
  1654. struct ggml_tensor * b,
  1655. bool inplace) {
  1656. GGML_ASSERT(ggml_is_scalar(b));
  1657. GGML_ASSERT(ggml_is_padded_1d(a));
  1658. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1659. result->op = GGML_OP_ADD1;
  1660. result->src[0] = a;
  1661. result->src[1] = b;
  1662. return result;
  1663. }
  1664. struct ggml_tensor * ggml_add1(
  1665. struct ggml_context * ctx,
  1666. struct ggml_tensor * a,
  1667. struct ggml_tensor * b) {
  1668. return ggml_add1_impl(ctx, a, b, false);
  1669. }
  1670. struct ggml_tensor * ggml_add1_inplace(
  1671. struct ggml_context * ctx,
  1672. struct ggml_tensor * a,
  1673. struct ggml_tensor * b) {
  1674. return ggml_add1_impl(ctx, a, b, true);
  1675. }
  1676. // ggml_acc
  1677. static struct ggml_tensor * ggml_acc_impl(
  1678. struct ggml_context * ctx,
  1679. struct ggml_tensor * a,
  1680. struct ggml_tensor * b,
  1681. size_t nb1,
  1682. size_t nb2,
  1683. size_t nb3,
  1684. size_t offset,
  1685. bool inplace) {
  1686. GGML_ASSERT(ggml_nelements(b) <= ggml_nelements(a));
  1687. GGML_ASSERT(ggml_is_contiguous(a));
  1688. GGML_ASSERT(a->type == GGML_TYPE_F32);
  1689. GGML_ASSERT(b->type == GGML_TYPE_F32);
  1690. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1691. int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
  1692. ggml_set_op_params(result, params, sizeof(params));
  1693. result->op = GGML_OP_ACC;
  1694. result->src[0] = a;
  1695. result->src[1] = b;
  1696. return result;
  1697. }
  1698. struct ggml_tensor * ggml_acc(
  1699. struct ggml_context * ctx,
  1700. struct ggml_tensor * a,
  1701. struct ggml_tensor * b,
  1702. size_t nb1,
  1703. size_t nb2,
  1704. size_t nb3,
  1705. size_t offset) {
  1706. return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
  1707. }
  1708. struct ggml_tensor * ggml_acc_inplace(
  1709. struct ggml_context * ctx,
  1710. struct ggml_tensor * a,
  1711. struct ggml_tensor * b,
  1712. size_t nb1,
  1713. size_t nb2,
  1714. size_t nb3,
  1715. size_t offset) {
  1716. return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
  1717. }
  1718. // ggml_sub
  1719. static struct ggml_tensor * ggml_sub_impl(
  1720. struct ggml_context * ctx,
  1721. struct ggml_tensor * a,
  1722. struct ggml_tensor * b,
  1723. bool inplace) {
  1724. GGML_ASSERT(ggml_can_repeat(b, a));
  1725. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1726. result->op = GGML_OP_SUB;
  1727. result->src[0] = a;
  1728. result->src[1] = b;
  1729. return result;
  1730. }
  1731. struct ggml_tensor * ggml_sub(
  1732. struct ggml_context * ctx,
  1733. struct ggml_tensor * a,
  1734. struct ggml_tensor * b) {
  1735. return ggml_sub_impl(ctx, a, b, false);
  1736. }
  1737. struct ggml_tensor * ggml_sub_inplace(
  1738. struct ggml_context * ctx,
  1739. struct ggml_tensor * a,
  1740. struct ggml_tensor * b) {
  1741. return ggml_sub_impl(ctx, a, b, true);
  1742. }
  1743. // ggml_mul
  1744. static struct ggml_tensor * ggml_mul_impl(
  1745. struct ggml_context * ctx,
  1746. struct ggml_tensor * a,
  1747. struct ggml_tensor * b,
  1748. bool inplace) {
  1749. GGML_ASSERT(ggml_can_repeat(b, a));
  1750. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1751. result->op = GGML_OP_MUL;
  1752. result->src[0] = a;
  1753. result->src[1] = b;
  1754. return result;
  1755. }
  1756. struct ggml_tensor * ggml_mul(
  1757. struct ggml_context * ctx,
  1758. struct ggml_tensor * a,
  1759. struct ggml_tensor * b) {
  1760. return ggml_mul_impl(ctx, a, b, false);
  1761. }
  1762. struct ggml_tensor * ggml_mul_inplace(
  1763. struct ggml_context * ctx,
  1764. struct ggml_tensor * a,
  1765. struct ggml_tensor * b) {
  1766. return ggml_mul_impl(ctx, a, b, true);
  1767. }
  1768. // ggml_div
  1769. static struct ggml_tensor * ggml_div_impl(
  1770. struct ggml_context * ctx,
  1771. struct ggml_tensor * a,
  1772. struct ggml_tensor * b,
  1773. bool inplace) {
  1774. GGML_ASSERT(ggml_can_repeat(b, a));
  1775. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1776. result->op = GGML_OP_DIV;
  1777. result->src[0] = a;
  1778. result->src[1] = b;
  1779. return result;
  1780. }
  1781. struct ggml_tensor * ggml_div(
  1782. struct ggml_context * ctx,
  1783. struct ggml_tensor * a,
  1784. struct ggml_tensor * b) {
  1785. return ggml_div_impl(ctx, a, b, false);
  1786. }
  1787. struct ggml_tensor * ggml_div_inplace(
  1788. struct ggml_context * ctx,
  1789. struct ggml_tensor * a,
  1790. struct ggml_tensor * b) {
  1791. return ggml_div_impl(ctx, a, b, true);
  1792. }
  1793. // ggml_sqr
  1794. static struct ggml_tensor * ggml_sqr_impl(
  1795. struct ggml_context * ctx,
  1796. struct ggml_tensor * a,
  1797. bool inplace) {
  1798. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1799. result->op = GGML_OP_SQR;
  1800. result->src[0] = a;
  1801. return result;
  1802. }
  1803. struct ggml_tensor * ggml_sqr(
  1804. struct ggml_context * ctx,
  1805. struct ggml_tensor * a) {
  1806. return ggml_sqr_impl(ctx, a, false);
  1807. }
  1808. struct ggml_tensor * ggml_sqr_inplace(
  1809. struct ggml_context * ctx,
  1810. struct ggml_tensor * a) {
  1811. return ggml_sqr_impl(ctx, a, true);
  1812. }
  1813. // ggml_sqrt
  1814. static struct ggml_tensor * ggml_sqrt_impl(
  1815. struct ggml_context * ctx,
  1816. struct ggml_tensor * a,
  1817. bool inplace) {
  1818. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1819. result->op = GGML_OP_SQRT;
  1820. result->src[0] = a;
  1821. return result;
  1822. }
  1823. struct ggml_tensor * ggml_sqrt(
  1824. struct ggml_context * ctx,
  1825. struct ggml_tensor * a) {
  1826. return ggml_sqrt_impl(ctx, a, false);
  1827. }
  1828. struct ggml_tensor * ggml_sqrt_inplace(
  1829. struct ggml_context * ctx,
  1830. struct ggml_tensor * a) {
  1831. return ggml_sqrt_impl(ctx, a, true);
  1832. }
  1833. // ggml_log
  1834. static struct ggml_tensor * ggml_log_impl(
  1835. struct ggml_context * ctx,
  1836. struct ggml_tensor * a,
  1837. bool inplace) {
  1838. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1839. result->op = GGML_OP_LOG;
  1840. result->src[0] = a;
  1841. return result;
  1842. }
  1843. struct ggml_tensor * ggml_log(
  1844. struct ggml_context * ctx,
  1845. struct ggml_tensor * a) {
  1846. return ggml_log_impl(ctx, a, false);
  1847. }
  1848. struct ggml_tensor * ggml_log_inplace(
  1849. struct ggml_context * ctx,
  1850. struct ggml_tensor * a) {
  1851. return ggml_log_impl(ctx, a, true);
  1852. }
  1853. // ggml_sin
  1854. static struct ggml_tensor * ggml_sin_impl(
  1855. struct ggml_context * ctx,
  1856. struct ggml_tensor * a,
  1857. bool inplace) {
  1858. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1859. result->op = GGML_OP_SIN;
  1860. result->src[0] = a;
  1861. return result;
  1862. }
  1863. struct ggml_tensor * ggml_sin(
  1864. struct ggml_context * ctx,
  1865. struct ggml_tensor * a) {
  1866. return ggml_sin_impl(ctx, a, false);
  1867. }
  1868. struct ggml_tensor * ggml_sin_inplace(
  1869. struct ggml_context * ctx,
  1870. struct ggml_tensor * a) {
  1871. return ggml_sin_impl(ctx, a, true);
  1872. }
  1873. // ggml_cos
  1874. static struct ggml_tensor * ggml_cos_impl(
  1875. struct ggml_context * ctx,
  1876. struct ggml_tensor * a,
  1877. bool inplace) {
  1878. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  1879. result->op = GGML_OP_COS;
  1880. result->src[0] = a;
  1881. return result;
  1882. }
  1883. struct ggml_tensor * ggml_cos(
  1884. struct ggml_context * ctx,
  1885. struct ggml_tensor * a) {
  1886. return ggml_cos_impl(ctx, a, false);
  1887. }
  1888. struct ggml_tensor * ggml_cos_inplace(
  1889. struct ggml_context * ctx,
  1890. struct ggml_tensor * a) {
  1891. return ggml_cos_impl(ctx, a, true);
  1892. }
  1893. // ggml_sum
  1894. struct ggml_tensor * ggml_sum(
  1895. struct ggml_context * ctx,
  1896. struct ggml_tensor * a) {
  1897. struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
  1898. result->op = GGML_OP_SUM;
  1899. result->src[0] = a;
  1900. return result;
  1901. }
  1902. // ggml_sum_rows
  1903. struct ggml_tensor * ggml_sum_rows(
  1904. struct ggml_context * ctx,
  1905. struct ggml_tensor * a) {
  1906. int64_t ne[GGML_MAX_DIMS] = { 1 };
  1907. for (int i = 1; i < GGML_MAX_DIMS; ++i) {
  1908. ne[i] = a->ne[i];
  1909. }
  1910. struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
  1911. result->op = GGML_OP_SUM_ROWS;
  1912. result->src[0] = a;
  1913. return result;
  1914. }
  1915. // ggml_mean
  1916. struct ggml_tensor * ggml_mean(
  1917. struct ggml_context * ctx,
  1918. struct ggml_tensor * a) {
  1919. int64_t ne[4] = { 1, a->ne[1], a->ne[2], a->ne[3] };
  1920. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  1921. result->op = GGML_OP_MEAN;
  1922. result->src[0] = a;
  1923. return result;
  1924. }
  1925. // ggml_argmax
  1926. struct ggml_tensor * ggml_argmax(
  1927. struct ggml_context * ctx,
  1928. struct ggml_tensor * a) {
  1929. GGML_ASSERT(ggml_is_matrix(a));
  1930. GGML_ASSERT(a->ne[0] <= INT32_MAX);
  1931. struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, a->ne[1]);
  1932. result->op = GGML_OP_ARGMAX;
  1933. result->src[0] = a;
  1934. return result;
  1935. }
  1936. // ggml_count_equal
  1937. struct ggml_tensor * ggml_count_equal(
  1938. struct ggml_context * ctx,
  1939. struct ggml_tensor * a,
  1940. struct ggml_tensor * b) {
  1941. GGML_ASSERT(ggml_are_same_shape(a, b));
  1942. struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 1);
  1943. result->op = GGML_OP_COUNT_EQUAL;
  1944. result->src[0] = a;
  1945. result->src[1] = b;
  1946. return result;
  1947. }
  1948. // ggml_repeat
  1949. struct ggml_tensor * ggml_repeat(
  1950. struct ggml_context * ctx,
  1951. struct ggml_tensor * a,
  1952. struct ggml_tensor * b) {
  1953. GGML_ASSERT(ggml_can_repeat(a, b));
  1954. struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
  1955. result->op = GGML_OP_REPEAT;
  1956. result->src[0] = a;
  1957. return result;
  1958. }
  1959. // ggml_repeat_back
  1960. struct ggml_tensor * ggml_repeat_back(
  1961. struct ggml_context * ctx,
  1962. struct ggml_tensor * a,
  1963. struct ggml_tensor * b) {
  1964. GGML_ASSERT(ggml_can_repeat(b, a));
  1965. struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, b->ne);
  1966. result->op = GGML_OP_REPEAT_BACK;
  1967. result->src[0] = a;
  1968. return result;
  1969. }
  1970. // ggml_concat
  1971. struct ggml_tensor * ggml_concat(
  1972. struct ggml_context * ctx,
  1973. struct ggml_tensor * a,
  1974. struct ggml_tensor * b,
  1975. int dim) {
  1976. GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
  1977. int64_t ne[GGML_MAX_DIMS];
  1978. for (int d = 0; d < GGML_MAX_DIMS; ++d) {
  1979. if (d == dim) {
  1980. ne[d] = a->ne[d] + b->ne[d];
  1981. continue;
  1982. }
  1983. GGML_ASSERT(a->ne[d] == b->ne[d]);
  1984. ne[d] = a->ne[d];
  1985. }
  1986. struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
  1987. ggml_set_op_params_i32(result, 0, dim);
  1988. result->op = GGML_OP_CONCAT;
  1989. result->src[0] = a;
  1990. result->src[1] = b;
  1991. return result;
  1992. }
  1993. // ggml_abs
  1994. struct ggml_tensor * ggml_abs(
  1995. struct ggml_context * ctx,
  1996. struct ggml_tensor * a) {
  1997. return ggml_unary(ctx, a, GGML_UNARY_OP_ABS);
  1998. }
  1999. struct ggml_tensor * ggml_abs_inplace(
  2000. struct ggml_context * ctx,
  2001. struct ggml_tensor * a) {
  2002. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
  2003. }
  2004. // ggml_sgn
  2005. struct ggml_tensor * ggml_sgn(
  2006. struct ggml_context * ctx,
  2007. struct ggml_tensor * a) {
  2008. return ggml_unary(ctx, a, GGML_UNARY_OP_SGN);
  2009. }
  2010. struct ggml_tensor * ggml_sgn_inplace(
  2011. struct ggml_context * ctx,
  2012. struct ggml_tensor * a) {
  2013. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SGN);
  2014. }
  2015. // ggml_neg
  2016. struct ggml_tensor * ggml_neg(
  2017. struct ggml_context * ctx,
  2018. struct ggml_tensor * a) {
  2019. return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
  2020. }
  2021. struct ggml_tensor * ggml_neg_inplace(
  2022. struct ggml_context * ctx,
  2023. struct ggml_tensor * a) {
  2024. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_NEG);
  2025. }
  2026. // ggml_step
  2027. struct ggml_tensor * ggml_step(
  2028. struct ggml_context * ctx,
  2029. struct ggml_tensor * a) {
  2030. return ggml_unary(ctx, a, GGML_UNARY_OP_STEP);
  2031. }
  2032. struct ggml_tensor * ggml_step_inplace(
  2033. struct ggml_context * ctx,
  2034. struct ggml_tensor * a) {
  2035. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_STEP);
  2036. }
  2037. // ggml_tanh
  2038. struct ggml_tensor * ggml_tanh(
  2039. struct ggml_context * ctx,
  2040. struct ggml_tensor * a) {
  2041. return ggml_unary(ctx, a, GGML_UNARY_OP_TANH);
  2042. }
  2043. struct ggml_tensor * ggml_tanh_inplace(
  2044. struct ggml_context * ctx,
  2045. struct ggml_tensor * a) {
  2046. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_TANH);
  2047. }
  2048. // ggml_elu
  2049. struct ggml_tensor * ggml_elu(
  2050. struct ggml_context * ctx,
  2051. struct ggml_tensor * a) {
  2052. return ggml_unary(ctx, a, GGML_UNARY_OP_ELU);
  2053. }
  2054. struct ggml_tensor * ggml_elu_inplace(
  2055. struct ggml_context * ctx,
  2056. struct ggml_tensor * a) {
  2057. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ELU);
  2058. }
  2059. // ggml_relu
  2060. struct ggml_tensor * ggml_relu(
  2061. struct ggml_context * ctx,
  2062. struct ggml_tensor * a) {
  2063. return ggml_unary(ctx, a, GGML_UNARY_OP_RELU);
  2064. }
  2065. struct ggml_tensor * ggml_relu_inplace(
  2066. struct ggml_context * ctx,
  2067. struct ggml_tensor * a) {
  2068. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_RELU);
  2069. }
  2070. // ggml_leaky_relu
  2071. struct ggml_tensor * ggml_leaky_relu(
  2072. struct ggml_context * ctx,
  2073. struct ggml_tensor * a,
  2074. float negative_slope,
  2075. bool inplace) {
  2076. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2077. ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
  2078. result->op = GGML_OP_LEAKY_RELU;
  2079. result->src[0] = a;
  2080. return result;
  2081. }
  2082. // ggml_sigmoid
  2083. struct ggml_tensor * ggml_sigmoid(
  2084. struct ggml_context * ctx,
  2085. struct ggml_tensor * a) {
  2086. return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
  2087. }
  2088. struct ggml_tensor * ggml_sigmoid_inplace(
  2089. struct ggml_context * ctx,
  2090. struct ggml_tensor * a) {
  2091. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
  2092. }
  2093. // ggml_gelu
  2094. struct ggml_tensor * ggml_gelu(
  2095. struct ggml_context * ctx,
  2096. struct ggml_tensor * a) {
  2097. return ggml_unary(ctx, a, GGML_UNARY_OP_GELU);
  2098. }
  2099. struct ggml_tensor * ggml_gelu_inplace(
  2100. struct ggml_context * ctx,
  2101. struct ggml_tensor * a) {
  2102. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU);
  2103. }
  2104. // ggml_gelu_quick
  2105. struct ggml_tensor * ggml_gelu_quick(
  2106. struct ggml_context * ctx,
  2107. struct ggml_tensor * a) {
  2108. return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_QUICK);
  2109. }
  2110. struct ggml_tensor * ggml_gelu_quick_inplace(
  2111. struct ggml_context * ctx,
  2112. struct ggml_tensor * a) {
  2113. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_QUICK);
  2114. }
  2115. // ggml_silu
  2116. struct ggml_tensor * ggml_silu(
  2117. struct ggml_context * ctx,
  2118. struct ggml_tensor * a) {
  2119. return ggml_unary(ctx, a, GGML_UNARY_OP_SILU);
  2120. }
  2121. struct ggml_tensor * ggml_silu_inplace(
  2122. struct ggml_context * ctx,
  2123. struct ggml_tensor * a) {
  2124. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SILU);
  2125. }
  2126. // ggml_silu_back
  2127. struct ggml_tensor * ggml_silu_back(
  2128. struct ggml_context * ctx,
  2129. struct ggml_tensor * a,
  2130. struct ggml_tensor * b) {
  2131. struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
  2132. result->op = GGML_OP_SILU_BACK;
  2133. result->src[0] = a;
  2134. result->src[1] = b;
  2135. return result;
  2136. }
  2137. // ggml hardswish
  2138. struct ggml_tensor * ggml_hardswish(
  2139. struct ggml_context * ctx,
  2140. struct ggml_tensor * a) {
  2141. return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
  2142. }
  2143. // ggml hardsigmoid
  2144. struct ggml_tensor * ggml_hardsigmoid(
  2145. struct ggml_context * ctx,
  2146. struct ggml_tensor * a) {
  2147. return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
  2148. }
  2149. // ggml exp
  2150. struct ggml_tensor * ggml_exp(
  2151. struct ggml_context * ctx,
  2152. struct ggml_tensor * a) {
  2153. return ggml_unary(ctx, a, GGML_UNARY_OP_EXP);
  2154. }
  2155. struct ggml_tensor * ggml_exp_inplace(
  2156. struct ggml_context * ctx,
  2157. struct ggml_tensor * a) {
  2158. return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXP);
  2159. }
  2160. // ggml_norm
  2161. static struct ggml_tensor * ggml_norm_impl(
  2162. struct ggml_context * ctx,
  2163. struct ggml_tensor * a,
  2164. float eps,
  2165. bool inplace) {
  2166. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2167. ggml_set_op_params(result, &eps, sizeof(eps));
  2168. result->op = GGML_OP_NORM;
  2169. result->src[0] = a;
  2170. return result;
  2171. }
  2172. struct ggml_tensor * ggml_norm(
  2173. struct ggml_context * ctx,
  2174. struct ggml_tensor * a,
  2175. float eps) {
  2176. return ggml_norm_impl(ctx, a, eps, false);
  2177. }
  2178. struct ggml_tensor * ggml_norm_inplace(
  2179. struct ggml_context * ctx,
  2180. struct ggml_tensor * a,
  2181. float eps) {
  2182. return ggml_norm_impl(ctx, a, eps, true);
  2183. }
  2184. // ggml_rms_norm
  2185. static struct ggml_tensor * ggml_rms_norm_impl(
  2186. struct ggml_context * ctx,
  2187. struct ggml_tensor * a,
  2188. float eps,
  2189. bool inplace) {
  2190. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2191. ggml_set_op_params(result, &eps, sizeof(eps));
  2192. result->op = GGML_OP_RMS_NORM;
  2193. result->src[0] = a;
  2194. return result;
  2195. }
  2196. struct ggml_tensor * ggml_rms_norm(
  2197. struct ggml_context * ctx,
  2198. struct ggml_tensor * a,
  2199. float eps) {
  2200. return ggml_rms_norm_impl(ctx, a, eps, false);
  2201. }
  2202. struct ggml_tensor * ggml_rms_norm_inplace(
  2203. struct ggml_context * ctx,
  2204. struct ggml_tensor * a,
  2205. float eps) {
  2206. return ggml_rms_norm_impl(ctx, a, eps, true);
  2207. }
  2208. // ggml_rms_norm_back
  2209. struct ggml_tensor * ggml_rms_norm_back(
  2210. struct ggml_context * ctx,
  2211. struct ggml_tensor * a,
  2212. struct ggml_tensor * b,
  2213. float eps) {
  2214. struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
  2215. ggml_set_op_params(result, &eps, sizeof(eps));
  2216. result->op = GGML_OP_RMS_NORM_BACK;
  2217. result->src[0] = a;
  2218. result->src[1] = b;
  2219. return result;
  2220. }
  2221. // ggml_group_norm
  2222. static struct ggml_tensor * ggml_group_norm_impl(
  2223. struct ggml_context * ctx,
  2224. struct ggml_tensor * a,
  2225. int n_groups,
  2226. float eps,
  2227. bool inplace) {
  2228. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2229. ggml_set_op_params_i32(result, 0, n_groups);
  2230. ggml_set_op_params_f32(result, 1, eps);
  2231. result->op = GGML_OP_GROUP_NORM;
  2232. result->src[0] = a;
  2233. return result;
  2234. }
  2235. struct ggml_tensor * ggml_group_norm(
  2236. struct ggml_context * ctx,
  2237. struct ggml_tensor * a,
  2238. int n_groups,
  2239. float eps) {
  2240. return ggml_group_norm_impl(ctx, a, n_groups, eps, false);
  2241. }
  2242. struct ggml_tensor * ggml_group_norm_inplace(
  2243. struct ggml_context * ctx,
  2244. struct ggml_tensor * a,
  2245. int n_groups,
  2246. float eps) {
  2247. return ggml_group_norm_impl(ctx, a, n_groups, eps, true);
  2248. }
  2249. // ggml_mul_mat
  2250. static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  2251. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  2252. return (t0->ne[0] == t1->ne[0]) &&
  2253. (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
  2254. (t1->ne[3]%t0->ne[3] == 0);
  2255. }
  2256. struct ggml_tensor * ggml_mul_mat(
  2257. struct ggml_context * ctx,
  2258. struct ggml_tensor * a,
  2259. struct ggml_tensor * b) {
  2260. GGML_ASSERT(ggml_can_mul_mat(a, b));
  2261. GGML_ASSERT(!ggml_is_transposed(a));
  2262. const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
  2263. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  2264. result->op = GGML_OP_MUL_MAT;
  2265. result->src[0] = a;
  2266. result->src[1] = b;
  2267. return result;
  2268. }
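// [editor's sketch] A minimal shape example, not part of the original file; n_embd, n_out and
// n_tokens are illustrative names. Since ne[0] is the row length, a weight w with
// ne = {n_embd, n_out} multiplied with activations x with ne = {n_embd, n_tokens} yields an
// F32 result with ne = {n_out, n_tokens}, matching the ne[] initializer above.
static struct ggml_tensor * ggml_example_linear(
        struct ggml_context * ctx,
        struct ggml_tensor  * w,   // {n_embd, n_out}
        struct ggml_tensor  * x) { // {n_embd, n_tokens}
    return ggml_mul_mat(ctx, w, x); // {n_out, n_tokens}
}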
  2269. void ggml_mul_mat_set_prec(
  2270. struct ggml_tensor * a,
  2271. enum ggml_prec prec) {
  2272. GGML_ASSERT(a->op == GGML_OP_MUL_MAT);
  2273. const int32_t prec_i32 = (int32_t) prec;
  2274. ggml_set_op_params_i32(a, 0, prec_i32);
  2275. }
  2276. // ggml_mul_mat_id
  2277. /*
  2278. c = ggml_mul_mat_id(ctx, as, b, ids);
  2279. as -> [cols, rows, n_expert]
2280. ids -> [n_expert_used, n_tokens] (i32)
2281. b -> [cols, n_expert_used, n_tokens]
2282. c -> [rows, n_expert_used, n_tokens]
2283. the n_expert_used dimension of b can be broadcast to match the n_expert_used of ids (see the sketch after this function)
  2284. c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
  2285. */
  2286. struct ggml_tensor * ggml_mul_mat_id(
  2287. struct ggml_context * ctx,
  2288. struct ggml_tensor * as,
  2289. struct ggml_tensor * b,
  2290. struct ggml_tensor * ids) {
  2291. GGML_ASSERT(!ggml_is_transposed(as));
  2292. GGML_ASSERT(ids->type == GGML_TYPE_I32);
  2293. GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert)
  2294. GGML_ASSERT(b->ne[3] == 1); // b is 3d
  2295. GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d
  2296. GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row
  2297. GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat
  2298. GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast
  2299. const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
  2300. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  2301. result->op = GGML_OP_MUL_MAT_ID;
  2302. result->src[0] = as;
  2303. result->src[1] = b;
  2304. result->src[2] = ids;
  2305. return result;
  2306. }
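// [editor's sketch] A hedged usage example, not part of the original file, matching the shape
// comment above ggml_mul_mat_id; all names (n_embd, n_ff, n_expert, ...) are illustrative.
static struct ggml_tensor * ggml_example_moe_matmul(
        struct ggml_context * ctx,
        struct ggml_tensor  * as,    // {n_embd, n_ff, n_expert} - one weight matrix per expert
        struct ggml_tensor  * x,     // {n_embd, n_expert_used, n_tokens} (or {n_embd, 1, n_tokens}, broadcast)
        struct ggml_tensor  * ids) { // {n_expert_used, n_tokens}, GGML_TYPE_I32 - selected experts per token
    return ggml_mul_mat_id(ctx, as, x, ids); // {n_ff, n_expert_used, n_tokens}
}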
  2307. // ggml_out_prod
  2308. static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  2309. static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
  2310. return (t0->ne[1] == t1->ne[1]) &&
  2311. (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
  2312. (t1->ne[3]%t0->ne[3] == 0);
  2313. }
  2314. struct ggml_tensor * ggml_out_prod(
  2315. struct ggml_context * ctx,
  2316. struct ggml_tensor * a,
  2317. struct ggml_tensor * b) {
  2318. GGML_ASSERT(ggml_can_out_prod(a, b));
  2319. GGML_ASSERT(!ggml_is_transposed(a));
  2320. // a is broadcastable to b for ne[2] and ne[3] -> use b->ne[2] and b->ne[3]
  2321. const int64_t ne[4] = { a->ne[0], b->ne[0], b->ne[2], b->ne[3] };
  2322. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  2323. result->op = GGML_OP_OUT_PROD;
  2324. result->src[0] = a;
  2325. result->src[1] = b;
  2326. return result;
  2327. }
  2328. // ggml_scale
  2329. static struct ggml_tensor * ggml_scale_impl(
  2330. struct ggml_context * ctx,
  2331. struct ggml_tensor * a,
  2332. float s,
  2333. bool inplace) {
  2334. GGML_ASSERT(ggml_is_padded_1d(a));
  2335. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2336. ggml_set_op_params(result, &s, sizeof(s));
  2337. result->op = GGML_OP_SCALE;
  2338. result->src[0] = a;
  2339. return result;
  2340. }
  2341. struct ggml_tensor * ggml_scale(
  2342. struct ggml_context * ctx,
  2343. struct ggml_tensor * a,
  2344. float s) {
  2345. return ggml_scale_impl(ctx, a, s, false);
  2346. }
  2347. struct ggml_tensor * ggml_scale_inplace(
  2348. struct ggml_context * ctx,
  2349. struct ggml_tensor * a,
  2350. float s) {
  2351. return ggml_scale_impl(ctx, a, s, true);
  2352. }
  2353. // ggml_set
  2354. static struct ggml_tensor * ggml_set_impl(
  2355. struct ggml_context * ctx,
  2356. struct ggml_tensor * a,
  2357. struct ggml_tensor * b,
  2358. size_t nb1,
  2359. size_t nb2,
  2360. size_t nb3,
  2361. size_t offset,
  2362. bool inplace) {
  2363. GGML_ASSERT(ggml_nelements(a) >= ggml_nelements(b));
  2364. // make a view of the destination
  2365. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2366. GGML_ASSERT(offset < (size_t)(1 << 30));
  2367. int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
  2368. ggml_set_op_params(result, params, sizeof(params));
  2369. result->op = GGML_OP_SET;
  2370. result->src[0] = a;
  2371. result->src[1] = b;
  2372. return result;
  2373. }
  2374. struct ggml_tensor * ggml_set(
  2375. struct ggml_context * ctx,
  2376. struct ggml_tensor * a,
  2377. struct ggml_tensor * b,
  2378. size_t nb1,
  2379. size_t nb2,
  2380. size_t nb3,
  2381. size_t offset) {
  2382. return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
  2383. }
  2384. struct ggml_tensor * ggml_set_inplace(
  2385. struct ggml_context * ctx,
  2386. struct ggml_tensor * a,
  2387. struct ggml_tensor * b,
  2388. size_t nb1,
  2389. size_t nb2,
  2390. size_t nb3,
  2391. size_t offset) {
  2392. return ggml_set_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
  2393. }
  2394. struct ggml_tensor * ggml_set_1d(
  2395. struct ggml_context * ctx,
  2396. struct ggml_tensor * a,
  2397. struct ggml_tensor * b,
  2398. size_t offset) {
  2399. return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, false);
  2400. }
  2401. struct ggml_tensor * ggml_set_1d_inplace(
  2402. struct ggml_context * ctx,
  2403. struct ggml_tensor * a,
  2404. struct ggml_tensor * b,
  2405. size_t offset) {
  2406. return ggml_set_impl(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], offset, true);
  2407. }
  2408. struct ggml_tensor * ggml_set_2d(
  2409. struct ggml_context * ctx,
  2410. struct ggml_tensor * a,
  2411. struct ggml_tensor * b,
  2412. size_t nb1,
  2413. size_t offset) {
  2414. return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
  2415. }
  2416. struct ggml_tensor * ggml_set_2d_inplace(
  2417. struct ggml_context * ctx,
  2418. struct ggml_tensor * a,
  2419. struct ggml_tensor * b,
  2420. size_t nb1,
  2421. size_t offset) {
  2422. return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
  2423. }
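/*
    Minimal sketch of the ggml_set family: nb1/nb2/nb3 are byte strides of the
    destination view and offset is a byte offset into it. Names (dst, blk, r0)
    are illustrative; blk->ne[0] is assumed to equal dst->ne[0].

        // write blk into dst starting at row r0
        struct ggml_tensor * out = ggml_set_2d(ctx, dst, blk,
                dst->nb[1],        // keep dst's row stride
                r0 * dst->nb[1]);  // byte offset of the first target row
*/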
  2424. // ggml_cpy
  2425. static struct ggml_tensor * ggml_cpy_impl(
  2426. struct ggml_context * ctx,
  2427. struct ggml_tensor * a,
  2428. struct ggml_tensor * b) {
  2429. GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
  2430. // make a view of the destination
  2431. struct ggml_tensor * result = ggml_view_tensor(ctx, b);
  2432. if (strlen(b->name) > 0) {
  2433. ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
  2434. } else {
  2435. ggml_format_name(result, "%s (copy)", a->name);
  2436. }
  2437. result->op = GGML_OP_CPY;
  2438. result->src[0] = a;
  2439. result->src[1] = b;
  2440. return result;
  2441. }
  2442. struct ggml_tensor * ggml_cpy(
  2443. struct ggml_context * ctx,
  2444. struct ggml_tensor * a,
  2445. struct ggml_tensor * b) {
  2446. return ggml_cpy_impl(ctx, a, b);
  2447. }
  2448. struct ggml_tensor * ggml_cast(
  2449. struct ggml_context * ctx,
  2450. struct ggml_tensor * a,
  2451. enum ggml_type type) {
  2452. struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
  2453. ggml_format_name(result, "%s (copy)", a->name);
  2454. result->op = GGML_OP_CPY;
  2455. result->src[0] = a;
  2456. result->src[1] = result;
  2457. return result;
  2458. }
  2459. // ggml_cont
  2460. static struct ggml_tensor * ggml_cont_impl(
  2461. struct ggml_context * ctx,
  2462. struct ggml_tensor * a) {
  2463. struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
  2464. ggml_format_name(result, "%s (cont)", a->name);
  2465. result->op = GGML_OP_CONT;
  2466. result->src[0] = a;
  2467. return result;
  2468. }
  2469. struct ggml_tensor * ggml_cont(
  2470. struct ggml_context * ctx,
  2471. struct ggml_tensor * a) {
  2472. return ggml_cont_impl(ctx, a);
  2473. }
  2474. // make contiguous, with new shape
  2475. GGML_API struct ggml_tensor * ggml_cont_1d(
  2476. struct ggml_context * ctx,
  2477. struct ggml_tensor * a,
  2478. int64_t ne0) {
  2479. return ggml_cont_4d(ctx, a, ne0, 1, 1, 1);
  2480. }
  2481. GGML_API struct ggml_tensor * ggml_cont_2d(
  2482. struct ggml_context * ctx,
  2483. struct ggml_tensor * a,
  2484. int64_t ne0,
  2485. int64_t ne1) {
  2486. return ggml_cont_4d(ctx, a, ne0, ne1, 1, 1);
  2487. }
  2488. GGML_API struct ggml_tensor * ggml_cont_3d(
  2489. struct ggml_context * ctx,
  2490. struct ggml_tensor * a,
  2491. int64_t ne0,
  2492. int64_t ne1,
  2493. int64_t ne2) {
  2494. return ggml_cont_4d(ctx, a, ne0, ne1, ne2, 1);
  2495. }
  2496. struct ggml_tensor * ggml_cont_4d(
  2497. struct ggml_context * ctx,
  2498. struct ggml_tensor * a,
  2499. int64_t ne0,
  2500. int64_t ne1,
  2501. int64_t ne2,
  2502. int64_t ne3) {
  2503. GGML_ASSERT(ggml_nelements(a) == (ne0*ne1*ne2*ne3));
  2504. struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
  2505. ggml_format_name(result, "%s (cont)", a->name);
  2506. result->op = GGML_OP_CONT;
  2507. result->src[0] = a;
  2508. return result;
  2509. }
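/*
    Minimal sketch: ggml_cont is typically used to materialize a non-contiguous
    view (such as a transpose or permute) into freshly laid-out memory. `x` is
    an illustrative name.

        struct ggml_tensor * xt = ggml_transpose(ctx, x); // strided view, non-contiguous
        xt = ggml_cont(ctx, xt);                          // copy into a contiguous tensor
        // or make contiguous and reshape in one step:
        // xt = ggml_cont_2d(ctx, xt, xt->ne[0], xt->ne[1]);
*/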
  2510. // ggml_reshape
  2511. struct ggml_tensor * ggml_reshape(
  2512. struct ggml_context * ctx,
  2513. struct ggml_tensor * a,
  2514. struct ggml_tensor * b) {
  2515. GGML_ASSERT(ggml_is_contiguous(a));
    // as only the shape of b is relevant, and not its memory layout, b is allowed to be non-contiguous
  2517. GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
  2518. struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b->ne, a, 0);
  2519. ggml_format_name(result, "%s (reshaped)", a->name);
  2520. result->op = GGML_OP_RESHAPE;
  2521. result->src[0] = a;
  2522. return result;
  2523. }
  2524. struct ggml_tensor * ggml_reshape_1d(
  2525. struct ggml_context * ctx,
  2526. struct ggml_tensor * a,
  2527. int64_t ne0) {
  2528. GGML_ASSERT(ggml_is_contiguous(a));
  2529. GGML_ASSERT(ggml_nelements(a) == ne0);
  2530. const int64_t ne[1] = { ne0 };
  2531. struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a, 0);
  2532. ggml_format_name(result, "%s (reshaped)", a->name);
  2533. result->op = GGML_OP_RESHAPE;
  2534. result->src[0] = a;
  2535. return result;
  2536. }
  2537. struct ggml_tensor * ggml_reshape_2d(
  2538. struct ggml_context * ctx,
  2539. struct ggml_tensor * a,
  2540. int64_t ne0,
  2541. int64_t ne1) {
  2542. GGML_ASSERT(ggml_is_contiguous(a));
  2543. GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
  2544. const int64_t ne[2] = { ne0, ne1 };
  2545. struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a, 0);
  2546. ggml_format_name(result, "%s (reshaped)", a->name);
  2547. result->op = GGML_OP_RESHAPE;
  2548. result->src[0] = a;
  2549. return result;
  2550. }
  2551. struct ggml_tensor * ggml_reshape_3d(
  2552. struct ggml_context * ctx,
  2553. struct ggml_tensor * a,
  2554. int64_t ne0,
  2555. int64_t ne1,
  2556. int64_t ne2) {
  2557. GGML_ASSERT(ggml_is_contiguous(a));
  2558. GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
  2559. const int64_t ne[3] = { ne0, ne1, ne2 };
  2560. struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a, 0);
  2561. ggml_format_name(result, "%s (reshaped)", a->name);
  2562. result->op = GGML_OP_RESHAPE;
  2563. result->src[0] = a;
  2564. return result;
  2565. }
  2566. struct ggml_tensor * ggml_reshape_4d(
  2567. struct ggml_context * ctx,
  2568. struct ggml_tensor * a,
  2569. int64_t ne0,
  2570. int64_t ne1,
  2571. int64_t ne2,
  2572. int64_t ne3) {
  2573. GGML_ASSERT(ggml_is_contiguous(a));
  2574. GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2*ne3);
  2575. const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
  2576. struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a, 0);
  2577. ggml_format_name(result, "%s (reshaped)", a->name);
  2578. result->op = GGML_OP_RESHAPE;
  2579. result->src[0] = a;
  2580. return result;
  2581. }
  2582. static struct ggml_tensor * ggml_view_impl(
  2583. struct ggml_context * ctx,
  2584. struct ggml_tensor * a,
  2585. int n_dims,
  2586. const int64_t * ne,
  2587. size_t offset) {
  2588. struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset);
  2589. ggml_format_name(result, "%s (view)", a->name);
  2590. ggml_set_op_params(result, &offset, sizeof(offset));
  2591. result->op = GGML_OP_VIEW;
  2592. result->src[0] = a;
  2593. return result;
  2594. }
  2595. // ggml_view_1d
  2596. struct ggml_tensor * ggml_view_1d(
  2597. struct ggml_context * ctx,
  2598. struct ggml_tensor * a,
  2599. int64_t ne0,
  2600. size_t offset) {
  2601. struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset);
  2602. return result;
  2603. }
  2604. // ggml_view_2d
  2605. struct ggml_tensor * ggml_view_2d(
  2606. struct ggml_context * ctx,
  2607. struct ggml_tensor * a,
  2608. int64_t ne0,
  2609. int64_t ne1,
  2610. size_t nb1,
  2611. size_t offset) {
  2612. const int64_t ne[2] = { ne0, ne1 };
  2613. struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset);
  2614. result->nb[1] = nb1;
  2615. result->nb[2] = result->nb[1]*ne1;
  2616. result->nb[3] = result->nb[2];
  2617. return result;
  2618. }
  2619. // ggml_view_3d
  2620. struct ggml_tensor * ggml_view_3d(
  2621. struct ggml_context * ctx,
  2622. struct ggml_tensor * a,
  2623. int64_t ne0,
  2624. int64_t ne1,
  2625. int64_t ne2,
  2626. size_t nb1,
  2627. size_t nb2,
  2628. size_t offset) {
  2629. const int64_t ne[3] = { ne0, ne1, ne2 };
  2630. struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset);
  2631. result->nb[1] = nb1;
  2632. result->nb[2] = nb2;
  2633. result->nb[3] = result->nb[2]*ne2;
  2634. return result;
  2635. }
  2636. // ggml_view_4d
  2637. struct ggml_tensor * ggml_view_4d(
  2638. struct ggml_context * ctx,
  2639. struct ggml_tensor * a,
  2640. int64_t ne0,
  2641. int64_t ne1,
  2642. int64_t ne2,
  2643. int64_t ne3,
  2644. size_t nb1,
  2645. size_t nb2,
  2646. size_t nb3,
  2647. size_t offset) {
  2648. const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
  2649. struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset);
  2650. result->nb[1] = nb1;
  2651. result->nb[2] = nb2;
  2652. result->nb[3] = nb3;
  2653. return result;
  2654. }
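/*
    Minimal sketch of ggml_view_2d: ne0/ne1 are element counts, nb1 is the byte
    stride between rows of the view, and offset is a byte offset into the parent
    tensor. Taking rows [r0, r0 + nr) of a 2D tensor `x` (illustrative names):

        struct ggml_tensor * rows = ggml_view_2d(ctx, x,
                x->ne[0], nr,      // full row width, nr rows
                x->nb[1],          // reuse the parent's row stride
                r0 * x->nb[1]);    // start r0 rows into the parent
*/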
  2655. // ggml_permute
  2656. struct ggml_tensor * ggml_permute(
  2657. struct ggml_context * ctx,
  2658. struct ggml_tensor * a,
  2659. int axis0,
  2660. int axis1,
  2661. int axis2,
  2662. int axis3) {
  2663. GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS);
  2664. GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS);
  2665. GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS);
  2666. GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS);
  2667. GGML_ASSERT(axis0 != axis1);
  2668. GGML_ASSERT(axis0 != axis2);
  2669. GGML_ASSERT(axis0 != axis3);
  2670. GGML_ASSERT(axis1 != axis2);
  2671. GGML_ASSERT(axis1 != axis3);
  2672. GGML_ASSERT(axis2 != axis3);
  2673. struct ggml_tensor * result = ggml_view_tensor(ctx, a);
  2674. ggml_format_name(result, "%s (permuted)", a->name);
  2675. int ne[GGML_MAX_DIMS];
  2676. int nb[GGML_MAX_DIMS];
  2677. ne[axis0] = a->ne[0];
  2678. ne[axis1] = a->ne[1];
  2679. ne[axis2] = a->ne[2];
  2680. ne[axis3] = a->ne[3];
  2681. nb[axis0] = a->nb[0];
  2682. nb[axis1] = a->nb[1];
  2683. nb[axis2] = a->nb[2];
  2684. nb[axis3] = a->nb[3];
  2685. result->ne[0] = ne[0];
  2686. result->ne[1] = ne[1];
  2687. result->ne[2] = ne[2];
  2688. result->ne[3] = ne[3];
  2689. result->nb[0] = nb[0];
  2690. result->nb[1] = nb[1];
  2691. result->nb[2] = nb[2];
  2692. result->nb[3] = nb[3];
  2693. result->op = GGML_OP_PERMUTE;
  2694. result->src[0] = a;
  2695. int32_t params[] = { axis0, axis1, axis2, axis3 };
  2696. ggml_set_op_params(result, params, sizeof(params));
  2697. return result;
  2698. }
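/*
    Minimal sketch: the axis arguments give the destination position of each
    source dimension, i.e. result->ne[axisN] = a->ne[N]. A common use is swapping
    dims 1 and 2, e.g. heads <-> tokens in attention; `q` is an illustrative name.

        // q: [head_dim, n_head, n_tokens, n_batch]
        struct ggml_tensor * qp = ggml_permute(ctx, q, 0, 2, 1, 3);
        // qp: [head_dim, n_tokens, n_head, n_batch] - still a strided view,
        // follow with ggml_cont() if a contiguous layout is needed
*/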
  2699. // ggml_transpose
  2700. struct ggml_tensor * ggml_transpose(
  2701. struct ggml_context * ctx,
  2702. struct ggml_tensor * a) {
  2703. struct ggml_tensor * result = ggml_view_tensor(ctx, a);
  2704. ggml_format_name(result, "%s (transposed)", a->name);
  2705. result->ne[0] = a->ne[1];
  2706. result->ne[1] = a->ne[0];
  2707. result->nb[0] = a->nb[1];
  2708. result->nb[1] = a->nb[0];
  2709. result->op = GGML_OP_TRANSPOSE;
  2710. result->src[0] = a;
  2711. return result;
  2712. }
  2713. // ggml_get_rows
  2714. struct ggml_tensor * ggml_get_rows(
  2715. struct ggml_context * ctx,
  2716. struct ggml_tensor * a,
  2717. struct ggml_tensor * b) {
  2718. GGML_ASSERT(a->ne[2] == b->ne[1]);
  2719. GGML_ASSERT(b->ne[3] == 1);
  2720. GGML_ASSERT(b->type == GGML_TYPE_I32);
  2721. // TODO: implement non F32 return
  2722. enum ggml_type type = GGML_TYPE_F32;
  2723. if (a->type == GGML_TYPE_I32) {
  2724. type = a->type;
  2725. }
  2726. struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
  2727. result->op = GGML_OP_GET_ROWS;
  2728. result->src[0] = a;
  2729. result->src[1] = b;
  2730. return result;
  2731. }
  2732. // ggml_get_rows_back
  2733. struct ggml_tensor * ggml_get_rows_back(
  2734. struct ggml_context * ctx,
  2735. struct ggml_tensor * a,
  2736. struct ggml_tensor * b,
  2737. struct ggml_tensor * c) {
  2738. GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
  2739. GGML_ASSERT(ggml_is_matrix(c) && (a->ne[0] == c->ne[0]));
  2740. // TODO: implement non F32 return
  2741. //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
  2742. struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, c->ne[0], c->ne[1]);
  2743. result->op = GGML_OP_GET_ROWS_BACK;
  2744. result->src[0] = a;
  2745. result->src[1] = b;
  2746. return result;
  2747. }
  2748. // ggml_diag
  2749. struct ggml_tensor * ggml_diag(
  2750. struct ggml_context * ctx,
  2751. struct ggml_tensor * a) {
  2752. GGML_ASSERT(a->ne[1] == 1);
  2753. const int64_t ne[4] = { a->ne[0], a->ne[0], a->ne[2], a->ne[3] };
  2754. struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, 4, ne);
  2755. result->op = GGML_OP_DIAG;
  2756. result->src[0] = a;
  2757. return result;
  2758. }
  2759. // ggml_diag_mask_inf
  2760. static struct ggml_tensor * ggml_diag_mask_inf_impl(
  2761. struct ggml_context * ctx,
  2762. struct ggml_tensor * a,
  2763. int n_past,
  2764. bool inplace) {
  2765. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2766. int32_t params[] = { n_past };
  2767. ggml_set_op_params(result, params, sizeof(params));
  2768. result->op = GGML_OP_DIAG_MASK_INF;
  2769. result->src[0] = a;
  2770. return result;
  2771. }
  2772. struct ggml_tensor * ggml_diag_mask_inf(
  2773. struct ggml_context * ctx,
  2774. struct ggml_tensor * a,
  2775. int n_past) {
  2776. return ggml_diag_mask_inf_impl(ctx, a, n_past, false);
  2777. }
  2778. struct ggml_tensor * ggml_diag_mask_inf_inplace(
  2779. struct ggml_context * ctx,
  2780. struct ggml_tensor * a,
  2781. int n_past) {
  2782. return ggml_diag_mask_inf_impl(ctx, a, n_past, true);
  2783. }
  2784. // ggml_diag_mask_zero
  2785. static struct ggml_tensor * ggml_diag_mask_zero_impl(
  2786. struct ggml_context * ctx,
  2787. struct ggml_tensor * a,
  2788. int n_past,
  2789. bool inplace) {
  2790. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2791. int32_t params[] = { n_past };
  2792. ggml_set_op_params(result, params, sizeof(params));
  2793. result->op = GGML_OP_DIAG_MASK_ZERO;
  2794. result->src[0] = a;
  2795. return result;
  2796. }
  2797. struct ggml_tensor * ggml_diag_mask_zero(
  2798. struct ggml_context * ctx,
  2799. struct ggml_tensor * a,
  2800. int n_past) {
  2801. return ggml_diag_mask_zero_impl(ctx, a, n_past, false);
  2802. }
  2803. struct ggml_tensor * ggml_diag_mask_zero_inplace(
  2804. struct ggml_context * ctx,
  2805. struct ggml_tensor * a,
  2806. int n_past) {
  2807. return ggml_diag_mask_zero_impl(ctx, a, n_past, true);
  2808. }
  2809. // ggml_soft_max
  2810. static struct ggml_tensor * ggml_soft_max_impl(
  2811. struct ggml_context * ctx,
  2812. struct ggml_tensor * a,
  2813. struct ggml_tensor * mask,
  2814. float scale,
  2815. float max_bias,
  2816. bool inplace) {
  2817. GGML_ASSERT(ggml_is_contiguous(a));
  2818. if (mask) {
  2819. GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32);
  2820. GGML_ASSERT(ggml_is_contiguous(mask));
  2821. GGML_ASSERT(ggml_is_matrix(mask));
  2822. GGML_ASSERT(mask->ne[0] == a->ne[0]);
  2823. GGML_ASSERT(mask->ne[1] >= a->ne[1]);
  2824. }
  2825. if (max_bias > 0.0f) {
  2826. GGML_ASSERT(mask);
  2827. }
  2828. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2829. float params[] = { scale, max_bias };
  2830. ggml_set_op_params(result, params, sizeof(params));
  2831. result->op = GGML_OP_SOFT_MAX;
  2832. result->src[0] = a;
  2833. result->src[1] = mask;
  2834. return result;
  2835. }
  2836. struct ggml_tensor * ggml_soft_max(
  2837. struct ggml_context * ctx,
  2838. struct ggml_tensor * a) {
  2839. return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
  2840. }
  2841. struct ggml_tensor * ggml_soft_max_inplace(
  2842. struct ggml_context * ctx,
  2843. struct ggml_tensor * a) {
  2844. return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
  2845. }
  2846. struct ggml_tensor * ggml_soft_max_ext(
  2847. struct ggml_context * ctx,
  2848. struct ggml_tensor * a,
  2849. struct ggml_tensor * mask,
  2850. float scale,
  2851. float max_bias) {
  2852. return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
  2853. }
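/*
    Minimal sketch of ggml_soft_max_ext on attention scores: `kq` holds raw K.Q
    scores and `kq_mask` is an additive mask (F16 or F32, -INF where masked).
    Names and the scale value are illustrative assumptions.

        // kq     : [n_kv, n_tokens, n_head, 1]
        // kq_mask: [n_kv, >= n_tokens], broadcast over heads
        kq = ggml_soft_max_ext(ctx, kq, kq_mask,
                1.0f / sqrtf((float) head_dim), // scale applied before the softmax
                0.0f);                          // max_bias: 0.0f disables ALiBi slopes
*/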
  2854. // ggml_soft_max_back
  2855. static struct ggml_tensor * ggml_soft_max_back_impl(
  2856. struct ggml_context * ctx,
  2857. struct ggml_tensor * a,
  2858. struct ggml_tensor * b,
  2859. bool inplace) {
  2860. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2861. result->op = GGML_OP_SOFT_MAX_BACK;
  2862. result->src[0] = a;
  2863. result->src[1] = b;
  2864. return result;
  2865. }
  2866. struct ggml_tensor * ggml_soft_max_back(
  2867. struct ggml_context * ctx,
  2868. struct ggml_tensor * a,
  2869. struct ggml_tensor * b) {
  2870. return ggml_soft_max_back_impl(ctx, a, b, false);
  2871. }
  2872. struct ggml_tensor * ggml_soft_max_back_inplace(
  2873. struct ggml_context * ctx,
  2874. struct ggml_tensor * a,
  2875. struct ggml_tensor * b) {
  2876. return ggml_soft_max_back_impl(ctx, a, b, true);
  2877. }
  2878. // ggml_rope
  2879. static struct ggml_tensor * ggml_rope_impl(
  2880. struct ggml_context * ctx,
  2881. struct ggml_tensor * a,
  2882. struct ggml_tensor * b,
  2883. struct ggml_tensor * c,
  2884. int n_dims,
  2885. int mode,
  2886. int n_ctx_orig,
  2887. float freq_base,
  2888. float freq_scale,
  2889. float ext_factor,
  2890. float attn_factor,
  2891. float beta_fast,
  2892. float beta_slow,
  2893. bool inplace) {
  2894. GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
  2895. GGML_ASSERT(ggml_is_vector(b));
  2896. GGML_ASSERT(b->type == GGML_TYPE_I32);
  2897. GGML_ASSERT(a->ne[2] == b->ne[0]);
  2898. if (c) {
  2899. GGML_ASSERT(c->type == GGML_TYPE_F32);
  2900. GGML_ASSERT(c->ne[0] >= n_dims / 2);
  2901. }
  2902. int sections[4] = {0, 0, 0, 0};
  2903. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  2904. int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
  2905. memcpy(params + 5, &freq_base, sizeof(float));
  2906. memcpy(params + 6, &freq_scale, sizeof(float));
  2907. memcpy(params + 7, &ext_factor, sizeof(float));
  2908. memcpy(params + 8, &attn_factor, sizeof(float));
  2909. memcpy(params + 9, &beta_fast, sizeof(float));
  2910. memcpy(params + 10, &beta_slow, sizeof(float));
  2911. memcpy(params + 11, &sections, sizeof(int)*4);
  2912. ggml_set_op_params(result, params, sizeof(params));
  2913. result->op = GGML_OP_ROPE;
  2914. result->src[0] = a;
  2915. result->src[1] = b;
  2916. result->src[2] = c;
  2917. return result;
  2918. }
  2919. struct ggml_tensor * ggml_rope(
  2920. struct ggml_context * ctx,
  2921. struct ggml_tensor * a,
  2922. struct ggml_tensor * b,
  2923. int n_dims,
  2924. int mode) {
  2925. return ggml_rope_impl(
  2926. ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
  2927. );
  2928. }
  2929. struct ggml_tensor * ggml_rope_multi(
  2930. struct ggml_context * ctx,
  2931. struct ggml_tensor * a,
  2932. struct ggml_tensor * b,
  2933. struct ggml_tensor * c,
  2934. int n_dims,
  2935. int sections[4],
  2936. int mode,
  2937. int n_ctx_orig,
  2938. float freq_base,
  2939. float freq_scale,
  2940. float ext_factor,
  2941. float attn_factor,
  2942. float beta_fast,
  2943. float beta_slow) {
  2944. // Multimodal Rotary Position Embedding
  2945. GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
  2946. GGML_ASSERT(ggml_is_vector(b));
  2947. GGML_ASSERT(b->type == GGML_TYPE_I32);
    GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expects 4 position ids per token
  2949. if (c) {
  2950. GGML_ASSERT(c->type == GGML_TYPE_F32);
  2951. GGML_ASSERT(c->ne[0] >= n_dims / 2);
  2952. }
  2953. struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
  2954. int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
  2955. memcpy(params + 5, &freq_base, sizeof(float));
  2956. memcpy(params + 6, &freq_scale, sizeof(float));
  2957. memcpy(params + 7, &ext_factor, sizeof(float));
  2958. memcpy(params + 8, &attn_factor, sizeof(float));
  2959. memcpy(params + 9, &beta_fast, sizeof(float));
  2960. memcpy(params + 10, &beta_slow, sizeof(float));
  2961. memcpy(&params[11], sections, sizeof(int)*4);
  2962. ggml_set_op_params(result, params, sizeof(params));
  2963. result->op = GGML_OP_ROPE;
  2964. result->src[0] = a;
  2965. result->src[1] = b;
  2966. result->src[2] = c;
  2967. return result;
  2968. }
  2969. struct ggml_tensor * ggml_rope_inplace(
  2970. struct ggml_context * ctx,
  2971. struct ggml_tensor * a,
  2972. struct ggml_tensor * b,
  2973. int n_dims,
  2974. int mode) {
  2975. return ggml_rope_impl(
  2976. ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
  2977. );
  2978. }
  2979. struct ggml_tensor * ggml_rope_ext(
  2980. struct ggml_context * ctx,
  2981. struct ggml_tensor * a,
  2982. struct ggml_tensor * b,
  2983. struct ggml_tensor * c,
  2984. int n_dims,
  2985. int mode,
  2986. int n_ctx_orig,
  2987. float freq_base,
  2988. float freq_scale,
  2989. float ext_factor,
  2990. float attn_factor,
  2991. float beta_fast,
  2992. float beta_slow) {
  2993. return ggml_rope_impl(
  2994. ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
  2995. ext_factor, attn_factor, beta_fast, beta_slow, false
  2996. );
  2997. }
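/*
    Minimal usage sketch for ggml_rope_ext; tensor names and hyperparameter
    values are illustrative (typical decoder settings), not defaults defined here.

        // q  : [head_dim, n_head, n_tokens, 1]
        // pos: [n_tokens] (I32) - position of each token
        q = ggml_rope_ext(ctx, q, pos, NULL,
                head_dim,         // n_dims: rotate every dimension of each head
                0,                // mode (NeoX-style models pass the corresponding GGML_ROPE_TYPE_* constant)
                4096,             // n_ctx_orig: original training context
                10000.0f, 1.0f,   // freq_base, freq_scale
                0.0f, 1.0f,       // ext_factor, attn_factor
                32.0f, 1.0f);     // beta_fast, beta_slow (YaRN ramp bounds)
*/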
  2998. struct ggml_tensor * ggml_rope_ext_inplace(
  2999. struct ggml_context * ctx,
  3000. struct ggml_tensor * a,
  3001. struct ggml_tensor * b,
  3002. struct ggml_tensor * c,
  3003. int n_dims,
  3004. int mode,
  3005. int n_ctx_orig,
  3006. float freq_base,
  3007. float freq_scale,
  3008. float ext_factor,
  3009. float attn_factor,
  3010. float beta_fast,
  3011. float beta_slow) {
  3012. return ggml_rope_impl(
  3013. ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
  3014. ext_factor, attn_factor, beta_fast, beta_slow, true
  3015. );
  3016. }
  3017. struct ggml_tensor * ggml_rope_custom(
  3018. struct ggml_context * ctx,
  3019. struct ggml_tensor * a,
  3020. struct ggml_tensor * b,
  3021. int n_dims,
  3022. int mode,
  3023. int n_ctx_orig,
  3024. float freq_base,
  3025. float freq_scale,
  3026. float ext_factor,
  3027. float attn_factor,
  3028. float beta_fast,
  3029. float beta_slow) {
  3030. return ggml_rope_impl(
  3031. ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
  3032. ext_factor, attn_factor, beta_fast, beta_slow, false
  3033. );
  3034. }
  3035. struct ggml_tensor * ggml_rope_custom_inplace(
  3036. struct ggml_context * ctx,
  3037. struct ggml_tensor * a,
  3038. struct ggml_tensor * b,
  3039. int n_dims,
  3040. int mode,
  3041. int n_ctx_orig,
  3042. float freq_base,
  3043. float freq_scale,
  3044. float ext_factor,
  3045. float attn_factor,
  3046. float beta_fast,
  3047. float beta_slow) {
  3048. return ggml_rope_impl(
  3049. ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
  3050. ext_factor, attn_factor, beta_fast, beta_slow, true
  3051. );
  3052. }
// Solving `max_pos_emb = n_rot * 2pi * base^((2 * corr_dim) / n_dims)` for corr_dim, we get
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
  3055. static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
  3056. return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
  3057. }
  3058. void ggml_rope_yarn_corr_dims(
  3059. int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
  3060. ) {
  3061. // start and end correction dims
  3062. float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
  3063. float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
  3064. dims[0] = MAX(0, start);
  3065. dims[1] = MIN(n_dims - 1, end);
  3066. }
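/*
    Minimal sketch of how the correction range is used; the values (128 rotary
    dims, 4096-token original context, beta_fast/beta_slow of 32/1) are
    illustrative assumptions.

        float corr_dims[2];
        ggml_rope_yarn_corr_dims(128, 4096, 10000.0f, 32.0f, 1.0f, corr_dims);
        // corr_dims[0]..corr_dims[1] bound the rotary dimensions over which the
        // YaRN interpolation ramp is applied
*/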
  3067. // ggml_rope_back
  3068. struct ggml_tensor * ggml_rope_back(
  3069. struct ggml_context * ctx,
  3070. struct ggml_tensor * a,
  3071. struct ggml_tensor * b,
  3072. struct ggml_tensor * c,
  3073. int n_dims,
  3074. int mode,
  3075. int n_ctx_orig,
  3076. float freq_base,
  3077. float freq_scale,
  3078. float ext_factor,
  3079. float attn_factor,
  3080. float beta_fast,
  3081. float beta_slow) {
  3082. GGML_ASSERT(ggml_is_vector(b));
  3083. GGML_ASSERT(b->type == GGML_TYPE_I32);
  3084. GGML_ASSERT(a->ne[2] == b->ne[0]);
  3085. struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
  3086. int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
  3087. memcpy(params + 5, &freq_base, sizeof(float));
  3088. memcpy(params + 6, &freq_scale, sizeof(float));
  3089. memcpy(params + 7, &ext_factor, sizeof(float));
  3090. memcpy(params + 8, &attn_factor, sizeof(float));
  3091. memcpy(params + 9, &beta_fast, sizeof(float));
  3092. memcpy(params + 10, &beta_slow, sizeof(float));
  3093. ggml_set_op_params(result, params, sizeof(params));
  3094. result->op = GGML_OP_ROPE_BACK;
  3095. result->src[0] = a;
  3096. result->src[1] = b;
  3097. result->src[2] = c;
  3098. return result;
  3099. }
  3100. // ggml_clamp
  3101. struct ggml_tensor * ggml_clamp(
  3102. struct ggml_context * ctx,
  3103. struct ggml_tensor * a,
  3104. float min,
  3105. float max) {
    // TODO: when implementing backward, fix this:
  3107. struct ggml_tensor * result = ggml_view_tensor(ctx, a);
  3108. float params[] = { min, max };
  3109. ggml_set_op_params(result, params, sizeof(params));
  3110. result->op = GGML_OP_CLAMP;
  3111. result->src[0] = a;
  3112. return result;
  3113. }
  3114. // ggml_conv_1d
  3115. static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
  3116. return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
  3117. }
  3118. GGML_API struct ggml_tensor * ggml_conv_1d(
  3119. struct ggml_context * ctx,
  3120. struct ggml_tensor * a,
  3121. struct ggml_tensor * b,
  3122. int s0,
  3123. int p0,
  3124. int d0) {
  3125. struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
  3126. struct ggml_tensor * result =
  3127. ggml_mul_mat(ctx,
  3128. ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
  3129. ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K]
  3130. result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
  3131. return result;
  3132. }
  3133. // ggml_conv_1d_ph
  3134. struct ggml_tensor* ggml_conv_1d_ph(
  3135. struct ggml_context * ctx,
  3136. struct ggml_tensor * a,
  3137. struct ggml_tensor * b,
  3138. int s,
  3139. int d) {
  3140. return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
  3141. }
  3142. // ggml_conv_transpose_1d
  3143. static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
  3144. return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
  3145. }
  3146. GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
  3147. struct ggml_context * ctx,
  3148. struct ggml_tensor * a,
  3149. struct ggml_tensor * b,
  3150. int s0,
  3151. int p0,
  3152. int d0) {
  3153. GGML_ASSERT(ggml_is_matrix(b));
  3154. GGML_ASSERT(a->ne[2] == b->ne[1]);
  3155. GGML_ASSERT(a->ne[3] == 1);
  3156. GGML_ASSERT(p0 == 0);
  3157. GGML_ASSERT(d0 == 1);
  3158. const int64_t ne[4] = {
  3159. ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
  3160. a->ne[1], b->ne[2], 1,
  3161. };
  3162. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  3163. int32_t params[] = { s0, p0, d0 };
  3164. ggml_set_op_params(result, params, sizeof(params));
  3165. result->op = GGML_OP_CONV_TRANSPOSE_1D;
  3166. result->src[0] = a;
  3167. result->src[1] = b;
  3168. return result;
  3169. }
  3170. // ggml_conv_depthwise
  3171. struct ggml_tensor * ggml_conv_depthwise_2d(
  3172. struct ggml_context * ctx,
  3173. struct ggml_tensor * a,
  3174. struct ggml_tensor * b,
  3175. int s0,
  3176. int s1,
  3177. int p0,
  3178. int p1,
  3179. int d0,
  3180. int d1) {
  3181. struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
  3182. struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
  3183. ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
  3184. s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
  3185. struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
  3186. new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
  3187. struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b);
  3188. result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
  3189. return result;
  3190. }
  3191. // ggml_conv_2d
  3192. // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
  3193. // a: [OC,IC, KH, KW]
  3194. // b: [N, IC, IH, IW]
  3195. // result: [N, OH, OW, IC*KH*KW]
  3196. struct ggml_tensor * ggml_im2col(
  3197. struct ggml_context * ctx,
  3198. struct ggml_tensor * a,
  3199. struct ggml_tensor * b,
  3200. int s0,
  3201. int s1,
  3202. int p0,
  3203. int p1,
  3204. int d0,
  3205. int d1,
  3206. bool is_2D,
  3207. enum ggml_type dst_type) {
  3208. if(is_2D) {
  3209. GGML_ASSERT(a->ne[2] == b->ne[2]);
  3210. } else {
  3211. GGML_ASSERT(a->ne[1] == b->ne[1]);
  3212. GGML_ASSERT(b->ne[3] == 1);
  3213. }
  3214. const int64_t OH = is_2D ? ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
  3215. const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
  3216. GGML_ASSERT((!is_2D || OH > 0) && "b too small compared to a");
  3217. GGML_ASSERT((OW > 0) && "b too small compared to a");
  3218. const int64_t ne[4] = {
  3219. is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
  3220. OW,
  3221. is_2D ? OH : b->ne[2],
  3222. is_2D ? b->ne[3] : 1,
  3223. };
  3224. struct ggml_tensor * result = ggml_new_tensor(ctx, dst_type, 4, ne);
  3225. int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
  3226. ggml_set_op_params(result, params, sizeof(params));
  3227. result->op = GGML_OP_IM2COL;
  3228. result->src[0] = a;
  3229. result->src[1] = b;
  3230. return result;
  3231. }
  3232. struct ggml_tensor * ggml_im2col_back(
  3233. struct ggml_context * ctx,
  3234. struct ggml_tensor * a,
  3235. struct ggml_tensor * b,
  3236. int64_t * ne,
  3237. int s0,
  3238. int s1,
  3239. int p0,
  3240. int p1,
  3241. int d0,
  3242. int d1,
  3243. bool is_2D) {
  3244. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  3245. int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
  3246. ggml_set_op_params(result, params, sizeof(params));
  3247. result->op = GGML_OP_IM2COL_BACK;
  3248. result->src[0] = a;
  3249. result->src[1] = b;
  3250. return result;
  3251. }
  3252. // a: [OC,IC, KH, KW]
  3253. // b: [N, IC, IH, IW]
  3254. // result: [N, OC, OH, OW]
  3255. struct ggml_tensor * ggml_conv_2d(
  3256. struct ggml_context * ctx,
  3257. struct ggml_tensor * a,
  3258. struct ggml_tensor * b,
  3259. int s0,
  3260. int s1,
  3261. int p0,
  3262. int p1,
  3263. int d0,
  3264. int d1) {
  3265. struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW]
  3266. struct ggml_tensor * result =
  3267. ggml_mul_mat(ctx,
  3268. ggml_reshape_2d(ctx, im2col, im2col->ne[0], im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
  3269. ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]), a->ne[3])); // [OC,IC, KH, KW] => [OC, IC * KH * KW]
  3270. result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
  3271. result = ggml_cont(ctx, ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
  3272. return result;
  3273. }
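/*
    Minimal usage sketch for ggml_conv_2d (shape comments use the same convention
    as above); names and parameter values are illustrative.

        // a (kernel): [OC, IC, KH, KW], b (input): [N, IC, IH, IW]
        struct ggml_tensor * out = ggml_conv_2d(ctx, a, b,
                1, 1,   // s0, s1: stride
                1, 1,   // p0, p1: padding
                1, 1);  // d0, d1: dilation
        // out: [N, OC, OH, OW], where OW = (IW + 2*p0 - d0*(KW - 1) - 1)/s0 + 1
        // and OH follows the same formula from IH, KH, s1, p1, d1
*/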
  3274. // ggml_conv_2d_sk_p0
  3275. struct ggml_tensor * ggml_conv_2d_sk_p0(
  3276. struct ggml_context * ctx,
  3277. struct ggml_tensor * a,
  3278. struct ggml_tensor * b) {
  3279. return ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
  3280. }
  3281. // ggml_conv_2d_s1_ph
  3282. struct ggml_tensor * ggml_conv_2d_s1_ph(
  3283. struct ggml_context * ctx,
  3284. struct ggml_tensor * a,
  3285. struct ggml_tensor * b) {
  3286. return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
  3287. }
  3288. // ggml_conv_transpose_2d_p0
  3289. static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
  3290. return (ins - 1) * s - 2 * p + ks;
  3291. }
  3292. struct ggml_tensor * ggml_conv_transpose_2d_p0(
  3293. struct ggml_context * ctx,
  3294. struct ggml_tensor * a,
  3295. struct ggml_tensor * b,
  3296. int stride) {
  3297. GGML_ASSERT(a->ne[3] == b->ne[2]);
  3298. const int64_t ne[4] = {
  3299. ggml_calc_conv_transpose_output_size(b->ne[0], a->ne[0], stride, 0 /*p0*/),
  3300. ggml_calc_conv_transpose_output_size(b->ne[1], a->ne[1], stride, 0 /*p1*/),
  3301. a->ne[2], b->ne[3],
  3302. };
  3303. struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  3304. ggml_set_op_params_i32(result, 0, stride);
  3305. result->op = GGML_OP_CONV_TRANSPOSE_2D;
  3306. result->src[0] = a;
  3307. result->src[1] = b;
  3308. return result;
  3309. }
  3310. // ggml_pool_*
  3311. static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, float p) {
  3312. return (ins + 2 * p - ks) / s + 1;
  3313. }
  3314. // ggml_pool_1d
  3315. struct ggml_tensor * ggml_pool_1d(
  3316. struct ggml_context * ctx,
  3317. struct ggml_tensor * a,
  3318. enum ggml_op_pool op,
  3319. int k0,
  3320. int s0,
  3321. int p0) {
  3322. const int64_t ne[4] = {
  3323. ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
  3324. a->ne[1],
  3325. a->ne[2],
  3326. a->ne[3],
  3327. };
  3328. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  3329. int32_t params[] = { op, k0, s0, p0 };
  3330. ggml_set_op_params(result, params, sizeof(params));
  3331. result->op = GGML_OP_POOL_1D;
  3332. result->src[0] = a;
  3333. return result;
  3334. }
  3335. // ggml_pool_2d
  3336. struct ggml_tensor * ggml_pool_2d(
  3337. struct ggml_context * ctx,
  3338. struct ggml_tensor * a,
  3339. enum ggml_op_pool op,
  3340. int k0,
  3341. int k1,
  3342. int s0,
  3343. int s1,
  3344. float p0,
  3345. float p1) {
  3346. struct ggml_tensor * result;
  3347. const int64_t ne[4] = {
  3348. ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
  3349. ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
  3350. a->ne[2],
  3351. a->ne[3],
  3352. };
  3353. result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  3354. int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
  3355. ggml_set_op_params(result, params, sizeof(params));
  3356. result->op = GGML_OP_POOL_2D;
  3357. result->src[0] = a;
  3358. return result;
  3359. }
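/*
    Minimal sketch: 2x2 average pooling with stride 2 and no padding; names are
    illustrative. Each output extent follows ggml_calc_pool_output_size:
    (in + 2*p - k)/s + 1, e.g. a 224x224 plane becomes 112x112.

        struct ggml_tensor * out = ggml_pool_2d(ctx, x, GGML_OP_POOL_AVG,
                2, 2,         // k0, k1
                2, 2,         // s0, s1
                0.0f, 0.0f);  // p0, p1
*/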
  3360. struct ggml_tensor * ggml_pool_2d_back(
  3361. struct ggml_context * ctx,
  3362. struct ggml_tensor * a,
  3363. struct ggml_tensor * af,
  3364. enum ggml_op_pool op,
  3365. int k0,
  3366. int k1,
  3367. int s0,
  3368. int s1,
  3369. float p0,
  3370. float p1) {
  3371. struct ggml_tensor * result;
  3372. result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, af->ne);
  3373. int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
  3374. ggml_set_op_params(result, params, sizeof(params));
  3375. result->op = GGML_OP_POOL_2D_BACK;
  3376. result->src[0] = a;
  3377. result->src[1] = af;
  3378. return result;
  3379. }
  3380. // ggml_upscale
  3381. static struct ggml_tensor * ggml_upscale_impl(
  3382. struct ggml_context * ctx,
  3383. struct ggml_tensor * a,
  3384. int ne0,
  3385. int ne1,
  3386. int ne2,
  3387. int ne3) {
  3388. GGML_ASSERT(a->ne[0] <= ne0);
  3389. GGML_ASSERT(a->ne[1] <= ne1);
  3390. GGML_ASSERT(a->ne[2] <= ne2);
  3391. GGML_ASSERT(a->ne[3] <= ne3);
  3392. struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
  3393. result->op = GGML_OP_UPSCALE;
  3394. result->src[0] = a;
  3395. return result;
  3396. }
  3397. struct ggml_tensor * ggml_upscale(
  3398. struct ggml_context * ctx,
  3399. struct ggml_tensor * a,
  3400. int scale_factor) {
  3401. return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
  3402. }
  3403. struct ggml_tensor * ggml_upscale_ext(
  3404. struct ggml_context * ctx,
  3405. struct ggml_tensor * a,
  3406. int ne0,
  3407. int ne1,
  3408. int ne2,
  3409. int ne3) {
  3410. return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
  3411. }
  3412. // ggml_pad
  3413. struct ggml_tensor * ggml_pad(
  3414. struct ggml_context * ctx,
  3415. struct ggml_tensor * a,
  3416. int p0,
  3417. int p1,
  3418. int p2,
  3419. int p3) {
  3420. struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
  3421. a->ne[0] + p0,
  3422. a->ne[1] + p1,
  3423. a->ne[2] + p2,
  3424. a->ne[3] + p3);
  3425. result->op = GGML_OP_PAD;
  3426. result->src[0] = a;
  3427. return result;
  3428. }
  3429. // ggml_pad_reflect_1d
  3430. struct ggml_tensor * ggml_pad_reflect_1d(
  3431. struct ggml_context * ctx,
  3432. struct ggml_tensor * a,
  3433. int p0,
  3434. int p1) {
  3435. GGML_ASSERT(p0 >= 0);
  3436. GGML_ASSERT(p1 >= 0);
    GGML_ASSERT(p0 < a->ne[0]); // padding length on each side must be less than the
    GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
  3439. GGML_ASSERT(ggml_is_contiguous(a));
  3440. GGML_ASSERT(a->type == GGML_TYPE_F32);
  3441. struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
  3442. a->ne[0] + p0 + p1,
  3443. a->ne[1],
  3444. a->ne[2],
  3445. a->ne[3]);
  3446. int32_t params[] = { p0, p1 };
  3447. ggml_set_op_params(result, params, sizeof(params));
  3448. result->op = GGML_OP_PAD_REFLECT_1D;
  3449. result->src[0] = a;
  3450. return result;
  3451. }
  3452. // ggml_unpad
  3453. struct ggml_tensor * ggml_unpad(
  3454. struct ggml_context * ctx,
  3455. struct ggml_tensor * a,
  3456. int p0, int p1, int p2, int p3) {
  3457. struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type,
  3458. a->ne[0] - p0,
  3459. a->ne[1] - p1,
  3460. a->ne[2] - p2,
  3461. a->ne[3] - p3);
  3462. result->op = GGML_OP_UNPAD;
  3463. result->src[0] = a;
  3464. return result;
  3465. }
  3466. // ggml_arange
  3467. struct ggml_tensor * ggml_arange(
  3468. struct ggml_context * ctx,
  3469. float start,
  3470. float stop,
  3471. float step) {
  3472. GGML_ASSERT(stop > start);
  3473. const int64_t steps = (int64_t) ceilf((stop - start) / step);
  3474. struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, steps);
  3475. ggml_set_op_params_f32(result, 0, start);
  3476. ggml_set_op_params_f32(result, 1, stop);
  3477. ggml_set_op_params_f32(result, 2, step);
  3478. result->op = GGML_OP_ARANGE;
  3479. return result;
  3480. }
  3481. // ggml_timestep_embedding
  3482. struct ggml_tensor * ggml_timestep_embedding(
  3483. struct ggml_context * ctx,
  3484. struct ggml_tensor * timesteps,
  3485. int dim,
  3486. int max_period) {
  3487. int actual_dim = dim;
  3488. if (dim % 2 != 0) {
  3489. actual_dim = dim + 1;
  3490. }
  3491. struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, actual_dim, timesteps->ne[0]);
  3492. ggml_set_op_params_i32(result, 0, dim);
  3493. ggml_set_op_params_i32(result, 1, max_period);
  3494. result->op = GGML_OP_TIMESTEP_EMBEDDING;
  3495. result->src[0] = timesteps;
  3496. return result;
  3497. }
  3498. // ggml_argsort
  3499. struct ggml_tensor * ggml_argsort(
  3500. struct ggml_context * ctx,
  3501. struct ggml_tensor * a,
  3502. enum ggml_sort_order order) {
  3503. GGML_ASSERT(a->ne[0] <= INT32_MAX);
  3504. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
  3505. ggml_set_op_params_i32(result, 0, (int32_t) order);
  3506. result->op = GGML_OP_ARGSORT;
  3507. result->src[0] = a;
  3508. return result;
  3509. }
  3510. // ggml_top_k
  3511. struct ggml_tensor * ggml_top_k(
  3512. struct ggml_context * ctx,
  3513. struct ggml_tensor * a,
  3514. int k) {
  3515. GGML_ASSERT(a->ne[0] >= k);
  3516. struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC);
  3517. result = ggml_view_4d(ctx, result,
  3518. k, result->ne[1], result->ne[2], result->ne[3],
  3519. result->nb[1], result->nb[2], result->nb[3],
  3520. 0);
  3521. return result;
  3522. }
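/*
    Minimal sketch: ggml_top_k is a descending argsort followed by a view of the
    first k columns, so the result holds I32 indices into `a`, not the values
    themselves. `logits` is an illustrative name.

        struct ggml_tensor * top = ggml_top_k(ctx, logits, 40);
        // top: [40, logits->ne[1], ...] (I32) - indices of the 40 largest entries per row
*/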
  3523. // ggml_flash_attn_ext
  3524. struct ggml_tensor * ggml_flash_attn_ext(
  3525. struct ggml_context * ctx,
  3526. struct ggml_tensor * q,
  3527. struct ggml_tensor * k,
  3528. struct ggml_tensor * v,
  3529. struct ggml_tensor * mask,
  3530. float scale,
  3531. float max_bias,
  3532. float logit_softcap) {
  3533. GGML_ASSERT(ggml_can_mul_mat(k, q));
  3534. // TODO: check if vT can be multiplied by (k*qT)
  3535. if (mask) {
  3536. GGML_ASSERT(ggml_is_contiguous(mask));
  3537. GGML_ASSERT(mask->ne[2] == 1);
  3538. GGML_ASSERT(mask->ne[3] == 1);
  3539. GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
  3540. "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
  3541. //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
  3542. }
  3543. if (max_bias > 0.0f) {
  3544. GGML_ASSERT(mask);
  3545. }
  3546. // permute(0, 2, 1, 3)
  3547. int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
  3548. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  3549. float params[] = { scale, max_bias, logit_softcap };
  3550. ggml_set_op_params(result, params, sizeof(params));
  3551. result->op = GGML_OP_FLASH_ATTN_EXT;
  3552. result->src[0] = q;
  3553. result->src[1] = k;
  3554. result->src[2] = v;
  3555. result->src[3] = mask;
  3556. return result;
  3557. }
  3558. void ggml_flash_attn_ext_set_prec(
  3559. struct ggml_tensor * a,
  3560. enum ggml_prec prec) {
  3561. GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
  3562. const int32_t prec_i32 = (int32_t) prec;
    ggml_set_op_params_i32(a, 3, prec_i32); // op params: [0] scale, [1] max_bias, [2] logit_softcap, [3] prec
  3564. }
  3565. enum ggml_prec ggml_flash_attn_ext_get_prec(
  3566. const struct ggml_tensor * a) {
  3567. GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
  3568. const int32_t prec_i32 = ggml_get_op_params_i32(a, 3);
  3569. return (enum ggml_prec) prec_i32;
  3570. }
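/*
    Minimal usage sketch for ggml_flash_attn_ext; tensor names, shapes and the
    scale value are illustrative assumptions.

        // q: [head_dim, n_tokens, n_head, 1], k/v: [head_dim, n_kv, n_head_kv, 1]
        // kq_mask: [n_kv, >= GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)]
        struct ggml_tensor * cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask,
                1.0f / sqrtf((float) head_dim), // scale
                0.0f,                           // max_bias: ALiBi disabled
                0.0f);                          // logit_softcap disabled
        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
        // cur: [head_dim, n_head, n_tokens, 1], see the permuted ne[] above
*/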
  3571. // ggml_flash_attn_back
  3572. struct ggml_tensor * ggml_flash_attn_back(
  3573. struct ggml_context * ctx,
  3574. struct ggml_tensor * q,
  3575. struct ggml_tensor * k,
  3576. struct ggml_tensor * v,
  3577. struct ggml_tensor * d,
  3578. bool masked) {
  3579. GGML_ABORT("TODO: adapt to ggml_flash_attn_ext() changes");
  3580. GGML_ASSERT(ggml_can_mul_mat(k, q));
  3581. // TODO: check if vT can be multiplied by (k*qT)
  3582. // d shape [D,N,ne2,ne3]
  3583. // q shape [D,N,ne2,ne3]
  3584. // k shape [D,M,kvne2,ne3]
  3585. // v shape [M,D,kvne2,ne3]
  3586. const int64_t D = q->ne[0];
  3587. const int64_t N = q->ne[1];
  3588. const int64_t M = k->ne[1];
  3589. const int64_t ne2 = q->ne[2];
  3590. const int64_t ne3 = q->ne[3];
  3591. const int64_t kvne2 = k->ne[2];
  3592. GGML_ASSERT(k->ne[0] == D);
  3593. GGML_ASSERT(v->ne[0] == M);
  3594. GGML_ASSERT(v->ne[1] == D);
  3595. GGML_ASSERT(d->ne[0] == D);
  3596. GGML_ASSERT(d->ne[1] == N);
  3597. GGML_ASSERT(k->ne[2] == kvne2);
  3598. GGML_ASSERT(k->ne[3] == ne3);
  3599. GGML_ASSERT(v->ne[2] == kvne2);
  3600. GGML_ASSERT(v->ne[3] == ne3);
  3601. GGML_ASSERT(d->ne[2] == ne2);
  3602. GGML_ASSERT(d->ne[3] == ne3);
  3603. GGML_ASSERT(ne2 % kvne2 == 0);
    // store gradients of q, k and v as contiguous tensors concatenated in result.
  3605. // note: v and gradv are actually transposed, i.e. v->ne[0] != D.
  3606. const int64_t elem_q = ggml_nelements(q);
  3607. const int64_t elem_k = ggml_nelements(k);
  3608. const int64_t elem_v = ggml_nelements(v);
  3609. enum ggml_type result_type = GGML_TYPE_F32;
  3610. GGML_ASSERT(ggml_blck_size(result_type) == 1);
  3611. const size_t tsize = ggml_type_size(result_type);
  3612. const size_t offs_q = 0;
  3613. const size_t offs_k = offs_q + GGML_PAD(elem_q * tsize, GGML_MEM_ALIGN);
  3614. const size_t offs_v = offs_k + GGML_PAD(elem_k * tsize, GGML_MEM_ALIGN);
  3615. const size_t end = offs_v + GGML_PAD(elem_v * tsize, GGML_MEM_ALIGN);
  3616. const size_t nelements = (end + tsize - 1)/tsize;
  3617. struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nelements);
  3618. int32_t masked_i = masked ? 1 : 0;
  3619. ggml_set_op_params(result, &masked_i, sizeof(masked_i));
  3620. result->op = GGML_OP_FLASH_ATTN_BACK;
  3621. result->src[0] = q;
  3622. result->src[1] = k;
  3623. result->src[2] = v;
  3624. result->src[3] = d;
  3625. return result;
  3626. }
  3627. // ggml_ssm_conv
  3628. struct ggml_tensor * ggml_ssm_conv(
  3629. struct ggml_context * ctx,
  3630. struct ggml_tensor * sx,
  3631. struct ggml_tensor * c) {
  3632. GGML_ASSERT(ggml_is_3d(sx));
  3633. GGML_ASSERT(ggml_is_matrix(c));
  3634. const int64_t d_conv = c->ne[0];
  3635. const int64_t d_inner = c->ne[1];
  3636. const int64_t n_t = sx->ne[0] - d_conv + 1; // tokens per sequence
  3637. const int64_t n_s = sx->ne[2];
  3638. // TODO: maybe support other strides than 1?
  3639. // FIXME: this is always true?
  3640. GGML_ASSERT(sx->ne[0] == d_conv - 1 + n_t);
  3641. GGML_ASSERT(sx->ne[1] == d_inner);
  3642. GGML_ASSERT(n_t >= 0);
  3643. struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_t, n_s);
  3644. result->op = GGML_OP_SSM_CONV;
  3645. result->src[0] = sx;
  3646. result->src[1] = c;
  3647. return result;
  3648. }
  3649. // ggml_ssm_scan
  3650. struct ggml_tensor * ggml_ssm_scan(
  3651. struct ggml_context * ctx,
  3652. struct ggml_tensor * s,
  3653. struct ggml_tensor * x,
  3654. struct ggml_tensor * dt,
  3655. struct ggml_tensor * A,
  3656. struct ggml_tensor * B,
  3657. struct ggml_tensor * C) {
  3658. GGML_ASSERT(ggml_is_contiguous(s));
  3659. GGML_ASSERT(ggml_is_contiguous(x));
  3660. GGML_ASSERT(ggml_is_contiguous(dt));
  3661. GGML_ASSERT(ggml_is_contiguous(A));
  3662. GGML_ASSERT(ggml_is_matrix(A));
  3663. GGML_ASSERT(ggml_is_3d(B));
  3664. GGML_ASSERT(ggml_is_3d(s));
  3665. GGML_ASSERT(B->nb[0] == ggml_type_size(B->type));
  3666. GGML_ASSERT(C->nb[0] == ggml_type_size(C->type));
  3667. GGML_ASSERT(ggml_are_same_shape(x, dt));
  3668. GGML_ASSERT(ggml_are_same_shape(B, C));
  3669. {
  3670. const int64_t d_state = s->ne[0];
  3671. const int64_t d_inner = s->ne[1];
  3672. const int64_t n_seq_tokens = x->ne[1];
  3673. const int64_t n_seqs = x->ne[2];
  3674. GGML_ASSERT(s->ne[2] == n_seqs);
  3675. GGML_ASSERT(x->ne[0] == d_inner);
  3676. GGML_ASSERT(A->ne[0] == d_state);
  3677. GGML_ASSERT(A->ne[1] == d_inner);
  3678. GGML_ASSERT(B->ne[0] == d_state);
  3679. GGML_ASSERT(B->ne[1] == n_seq_tokens);
  3680. GGML_ASSERT(B->ne[2] == n_seqs);
  3681. }
  3682. // concatenated y + ssm_states
  3683. struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s));
  3684. result->op = GGML_OP_SSM_SCAN;
  3685. result->src[0] = s;
  3686. result->src[1] = x;
  3687. result->src[2] = dt;
  3688. result->src[3] = A;
  3689. result->src[4] = B;
  3690. result->src[5] = C;
  3691. return result;
  3692. }
  3693. // ggml_win_part
  3694. struct ggml_tensor * ggml_win_part(
  3695. struct ggml_context * ctx,
  3696. struct ggml_tensor * a,
  3697. int w) {
  3698. GGML_ASSERT(a->ne[3] == 1);
  3699. GGML_ASSERT(a->type == GGML_TYPE_F32);
  3700. // padding
  3701. const int px = (w - a->ne[1]%w)%w;
  3702. const int py = (w - a->ne[2]%w)%w;
  3703. const int npx = (px + a->ne[1])/w;
  3704. const int npy = (py + a->ne[2])/w;
  3705. const int np = npx*npy;
  3706. const int64_t ne[4] = { a->ne[0], w, w, np, };
  3707. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  3708. int32_t params[] = { npx, npy, w };
  3709. ggml_set_op_params(result, params, sizeof(params));
  3710. result->op = GGML_OP_WIN_PART;
  3711. result->src[0] = a;
  3712. return result;
  3713. }
  3714. // ggml_win_unpart
  3715. struct ggml_tensor * ggml_win_unpart(
  3716. struct ggml_context * ctx,
  3717. struct ggml_tensor * a,
  3718. int w0,
  3719. int h0,
  3720. int w) {
  3721. GGML_ASSERT(a->type == GGML_TYPE_F32);
  3722. const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
  3723. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
  3724. int32_t params[] = { w };
  3725. ggml_set_op_params(result, params, sizeof(params));
  3726. result->op = GGML_OP_WIN_UNPART;
  3727. result->src[0] = a;
  3728. return result;
  3729. }
  3730. // ggml_get_rel_pos
  3731. struct ggml_tensor * ggml_get_rel_pos(
  3732. struct ggml_context * ctx,
  3733. struct ggml_tensor * a,
  3734. int qh,
  3735. int kh) {
  3736. GGML_ASSERT(qh == kh);
  3737. GGML_ASSERT(2*MAX(qh, kh) - 1 == a->ne[1]);
  3738. const int64_t ne[4] = { a->ne[0], kh, qh, 1, };
  3739. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 3, ne);
  3740. result->op = GGML_OP_GET_REL_POS;
  3741. result->src[0] = a;
  3742. return result;
  3743. }
  3744. // ggml_add_rel_pos
  3745. static struct ggml_tensor * ggml_add_rel_pos_impl(
  3746. struct ggml_context * ctx,
  3747. struct ggml_tensor * a,
  3748. struct ggml_tensor * pw,
  3749. struct ggml_tensor * ph,
  3750. bool inplace) {
  3751. GGML_ASSERT(ggml_are_same_shape(pw, ph));
  3752. GGML_ASSERT(ggml_is_contiguous(a));
  3753. GGML_ASSERT(ggml_is_contiguous(pw));
  3754. GGML_ASSERT(ggml_is_contiguous(ph));
  3755. GGML_ASSERT(ph->type == GGML_TYPE_F32);
  3756. GGML_ASSERT(pw->type == GGML_TYPE_F32);
  3757. GGML_ASSERT(pw->ne[3] == a->ne[2]);
  3758. GGML_ASSERT(pw->ne[0]*pw->ne[0] == a->ne[0]);
  3759. GGML_ASSERT(pw->ne[1]*pw->ne[2] == a->ne[1]);
  3760. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  3761. ggml_set_op_params_i32(result, 0, inplace ? 1 : 0);
  3762. result->op = GGML_OP_ADD_REL_POS;
  3763. result->src[0] = a;
  3764. result->src[1] = pw;
  3765. result->src[2] = ph;
  3766. return result;
  3767. }
  3768. struct ggml_tensor * ggml_add_rel_pos(
  3769. struct ggml_context * ctx,
  3770. struct ggml_tensor * a,
  3771. struct ggml_tensor * pw,
  3772. struct ggml_tensor * ph) {
  3773. return ggml_add_rel_pos_impl(ctx, a, pw, ph, false);
  3774. }
  3775. struct ggml_tensor * ggml_add_rel_pos_inplace(
  3776. struct ggml_context * ctx,
  3777. struct ggml_tensor * a,
  3778. struct ggml_tensor * pw,
  3779. struct ggml_tensor * ph) {
  3780. return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
  3781. }
  3782. // ggml_rwkv_wkv6
  3783. struct ggml_tensor * ggml_rwkv_wkv6(
  3784. struct ggml_context * ctx,
  3785. struct ggml_tensor * k,
  3786. struct ggml_tensor * v,
  3787. struct ggml_tensor * r,
  3788. struct ggml_tensor * tf,
  3789. struct ggml_tensor * td,
  3790. struct ggml_tensor * state) {
  3791. GGML_ASSERT(ggml_is_contiguous(k));
  3792. GGML_ASSERT(ggml_is_contiguous(v));
  3793. GGML_ASSERT(ggml_is_contiguous(r));
  3794. GGML_ASSERT(ggml_is_contiguous(tf));
  3795. GGML_ASSERT(ggml_is_contiguous(td));
  3796. GGML_ASSERT(ggml_is_contiguous(state));
  3797. const int64_t S = k->ne[0];
  3798. const int64_t H = k->ne[2];
  3799. const int64_t n_tokens = k->ne[3];
  3800. const int64_t n_seqs = state->ne[1];
  3801. {
  3802. GGML_ASSERT(k->ne[1] == 1);
  3803. GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens);
  3804. GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens);
  3805. // TODO: RWKV v4 and v5
  3806. GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens);
  3807. GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs);
  3808. }
  3809. // concat output and new_state
  3810. const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 };
  3811. struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
  3812. result->op = GGML_OP_RWKV_WKV6;
  3813. result->src[0] = k;
  3814. result->src[1] = v;
  3815. result->src[2] = r;
  3816. result->src[3] = tf;
  3817. result->src[4] = td;
  3818. result->src[5] = state;
  3819. return result;
  3820. }
  3821. // ggml_unary
  3822. static struct ggml_tensor * ggml_unary_impl(
  3823. struct ggml_context * ctx,
  3824. struct ggml_tensor * a,
  3825. enum ggml_unary_op op,
  3826. bool inplace) {
  3827. GGML_ASSERT(ggml_is_contiguous_1(a));
  3828. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  3829. ggml_set_op_params_i32(result, 0, (int32_t) op);
  3830. result->op = GGML_OP_UNARY;
  3831. result->src[0] = a;
  3832. return result;
  3833. }
  3834. struct ggml_tensor * ggml_unary(
  3835. struct ggml_context * ctx,
  3836. struct ggml_tensor * a,
  3837. enum ggml_unary_op op) {
  3838. return ggml_unary_impl(ctx, a, op, false);
  3839. }
  3840. struct ggml_tensor * ggml_unary_inplace(
  3841. struct ggml_context * ctx,
  3842. struct ggml_tensor * a,
  3843. enum ggml_unary_op op) {
  3844. return ggml_unary_impl(ctx, a, op, true);
  3845. }
  3846. // ggml_map_unary
  3847. static struct ggml_tensor * ggml_map_unary_impl_f32(
  3848. struct ggml_context * ctx,
  3849. struct ggml_tensor * a,
  3850. const ggml_unary_op_f32_t fun,
  3851. bool inplace) {
  3852. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  3853. ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
  3854. result->op = GGML_OP_MAP_UNARY;
  3855. result->src[0] = a;
  3856. return result;
  3857. }
  3858. struct ggml_tensor * ggml_map_unary_f32(
  3859. struct ggml_context * ctx,
  3860. struct ggml_tensor * a,
  3861. const ggml_unary_op_f32_t fun) {
  3862. return ggml_map_unary_impl_f32(ctx, a, fun, false);
  3863. }
  3864. struct ggml_tensor * ggml_map_unary_inplace_f32(
  3865. struct ggml_context * ctx,
  3866. struct ggml_tensor * a,
  3867. const ggml_unary_op_f32_t fun) {
  3868. return ggml_map_unary_impl_f32(ctx, a, fun, true);
  3869. }
  3870. // ggml_map_binary
  3871. static struct ggml_tensor * ggml_map_binary_impl_f32(
  3872. struct ggml_context * ctx,
  3873. struct ggml_tensor * a,
  3874. struct ggml_tensor * b,
  3875. const ggml_binary_op_f32_t fun,
  3876. bool inplace) {
  3877. GGML_ASSERT(ggml_are_same_shape(a, b));
  3878. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  3879. ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
  3880. result->op = GGML_OP_MAP_BINARY;
  3881. result->src[0] = a;
  3882. result->src[1] = b;
  3883. return result;
  3884. }
  3885. struct ggml_tensor * ggml_map_binary_f32(
  3886. struct ggml_context * ctx,
  3887. struct ggml_tensor * a,
  3888. struct ggml_tensor * b,
  3889. const ggml_binary_op_f32_t fun) {
  3890. return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
  3891. }
  3892. struct ggml_tensor * ggml_map_binary_inplace_f32(
  3893. struct ggml_context * ctx,
  3894. struct ggml_tensor * a,
  3895. struct ggml_tensor * b,
  3896. const ggml_binary_op_f32_t fun) {
  3897. return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
  3898. }
  3899. // ggml_map_custom1_f32
  3900. static struct ggml_tensor * ggml_map_custom1_impl_f32(
  3901. struct ggml_context * ctx,
  3902. struct ggml_tensor * a,
  3903. const ggml_custom1_op_f32_t fun,
  3904. bool inplace) {
  3905. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  3906. ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
  3907. result->op = GGML_OP_MAP_CUSTOM1_F32;
  3908. result->src[0] = a;
  3909. return result;
  3910. }
  3911. struct ggml_tensor * ggml_map_custom1_f32(
  3912. struct ggml_context * ctx,
  3913. struct ggml_tensor * a,
  3914. const ggml_custom1_op_f32_t fun) {
  3915. return ggml_map_custom1_impl_f32(ctx, a, fun, false);
  3916. }
  3917. struct ggml_tensor * ggml_map_custom1_inplace_f32(
  3918. struct ggml_context * ctx,
  3919. struct ggml_tensor * a,
  3920. const ggml_custom1_op_f32_t fun) {
  3921. return ggml_map_custom1_impl_f32(ctx, a, fun, true);
  3922. }
  3923. // ggml_map_custom2_f32
  3924. static struct ggml_tensor * ggml_map_custom2_impl_f32(
  3925. struct ggml_context * ctx,
  3926. struct ggml_tensor * a,
  3927. struct ggml_tensor * b,
  3928. const ggml_custom2_op_f32_t fun,
  3929. bool inplace) {
  3930. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  3931. ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
  3932. result->op = GGML_OP_MAP_CUSTOM2_F32;
  3933. result->src[0] = a;
  3934. result->src[1] = b;
  3935. return result;
  3936. }
  3937. struct ggml_tensor * ggml_map_custom2_f32(
  3938. struct ggml_context * ctx,
  3939. struct ggml_tensor * a,
  3940. struct ggml_tensor * b,
  3941. const ggml_custom2_op_f32_t fun) {
  3942. return ggml_map_custom2_impl_f32(ctx, a, b, fun, false);
  3943. }
  3944. struct ggml_tensor * ggml_map_custom2_inplace_f32(
  3945. struct ggml_context * ctx,
  3946. struct ggml_tensor * a,
  3947. struct ggml_tensor * b,
  3948. const ggml_custom2_op_f32_t fun) {
  3949. return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
  3950. }
  3951. // ggml_map_custom3_f32
  3952. static struct ggml_tensor * ggml_map_custom3_impl_f32(
  3953. struct ggml_context * ctx,
  3954. struct ggml_tensor * a,
  3955. struct ggml_tensor * b,
  3956. struct ggml_tensor * c,
  3957. const ggml_custom3_op_f32_t fun,
  3958. bool inplace) {
  3959. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  3960. ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
  3961. result->op = GGML_OP_MAP_CUSTOM3_F32;
  3962. result->src[0] = a;
  3963. result->src[1] = b;
  3964. result->src[2] = c;
  3965. return result;
  3966. }
  3967. struct ggml_tensor * ggml_map_custom3_f32(
  3968. struct ggml_context * ctx,
  3969. struct ggml_tensor * a,
  3970. struct ggml_tensor * b,
  3971. struct ggml_tensor * c,
  3972. const ggml_custom3_op_f32_t fun) {
  3973. return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false);
  3974. }
  3975. struct ggml_tensor * ggml_map_custom3_inplace_f32(
  3976. struct ggml_context * ctx,
  3977. struct ggml_tensor * a,
  3978. struct ggml_tensor * b,
  3979. struct ggml_tensor * c,
  3980. const ggml_custom3_op_f32_t fun) {
  3981. return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
  3982. }
  3983. // ggml_map_custom1
  3984. static struct ggml_tensor * ggml_map_custom1_impl(
  3985. struct ggml_context * ctx,
  3986. struct ggml_tensor * a,
  3987. const ggml_custom1_op_t fun,
  3988. int n_tasks,
  3989. void * userdata,
  3990. bool inplace) {
  3991. GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
  3992. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  3993. struct ggml_map_custom1_op_params params = {
  3994. /*.fun =*/ fun,
  3995. /*.n_tasks =*/ n_tasks,
  3996. /*.userdata =*/ userdata
  3997. };
  3998. ggml_set_op_params(result, (const void *) &params, sizeof(params));
  3999. result->op = GGML_OP_MAP_CUSTOM1;
  4000. result->src[0] = a;
  4001. return result;
  4002. }
  4003. struct ggml_tensor * ggml_map_custom1(
  4004. struct ggml_context * ctx,
  4005. struct ggml_tensor * a,
  4006. const ggml_custom1_op_t fun,
  4007. int n_tasks,
  4008. void * userdata) {
  4009. return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
  4010. }
  4011. struct ggml_tensor * ggml_map_custom1_inplace(
  4012. struct ggml_context * ctx,
  4013. struct ggml_tensor * a,
  4014. const ggml_custom1_op_t fun,
  4015. int n_tasks,
  4016. void * userdata) {
  4017. return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
  4018. }
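//
// editor's note: the following sketch is not part of the original ggml.c; it shows one
// way a user-defined op can be attached to the graph via ggml_map_custom1. the callback
// name `example_scale_rows` and the scale parameter are illustrative assumptions; the
// callback splits the rows of `dst` across the `nth` threads it is given.
//
#if 0 // illustrative only - not compiled
static void example_scale_rows(struct ggml_tensor * dst, const struct ggml_tensor * a,
                               int ith, int nth, void * userdata) {
    const float   s  = *(const float *) userdata;    // per-call parameter
    const int64_t nr = ggml_nrows(dst);              // total number of rows
    for (int64_t ir = ith; ir < nr; ir += nth) {     // each thread takes every nth row
        const float * src_row = (const float *) ((const char *) a->data   + ir*a->nb[1]);
        float       * dst_row = (float       *) ((char       *) dst->data + ir*dst->nb[1]);
        for (int64_t i = 0; i < dst->ne[0]; ++i) {
            dst_row[i] = s*src_row[i];
        }
    }
}

// usage: result has the same shape as `a`; the callback runs when the graph is computed
//   static float scale = 0.5f;
//   struct ggml_tensor * result = ggml_map_custom1(ctx, a, example_scale_rows, GGML_N_TASKS_MAX, &scale);
#endif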
  4019. // ggml_map_custom2
  4020. static struct ggml_tensor * ggml_map_custom2_impl(
  4021. struct ggml_context * ctx,
  4022. struct ggml_tensor * a,
  4023. struct ggml_tensor * b,
  4024. const ggml_custom2_op_t fun,
  4025. int n_tasks,
  4026. void * userdata,
  4027. bool inplace) {
  4028. GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
  4029. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  4030. struct ggml_map_custom2_op_params params = {
  4031. /*.fun =*/ fun,
  4032. /*.n_tasks =*/ n_tasks,
  4033. /*.userdata =*/ userdata
  4034. };
  4035. ggml_set_op_params(result, (const void *) &params, sizeof(params));
  4036. result->op = GGML_OP_MAP_CUSTOM2;
  4037. result->src[0] = a;
  4038. result->src[1] = b;
  4039. return result;
  4040. }
  4041. struct ggml_tensor * ggml_map_custom2(
  4042. struct ggml_context * ctx,
  4043. struct ggml_tensor * a,
  4044. struct ggml_tensor * b,
  4045. const ggml_custom2_op_t fun,
  4046. int n_tasks,
  4047. void * userdata) {
  4048. return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
  4049. }
  4050. struct ggml_tensor * ggml_map_custom2_inplace(
  4051. struct ggml_context * ctx,
  4052. struct ggml_tensor * a,
  4053. struct ggml_tensor * b,
  4054. const ggml_custom2_op_t fun,
  4055. int n_tasks,
  4056. void * userdata) {
  4057. return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
  4058. }
  4059. // ggml_map_custom3
  4060. static struct ggml_tensor * ggml_map_custom3_impl(
  4061. struct ggml_context * ctx,
  4062. struct ggml_tensor * a,
  4063. struct ggml_tensor * b,
  4064. struct ggml_tensor * c,
  4065. const ggml_custom3_op_t fun,
  4066. int n_tasks,
  4067. void * userdata,
  4068. bool inplace) {
  4069. GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
  4070. struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
  4071. struct ggml_map_custom3_op_params params = {
  4072. /*.fun =*/ fun,
  4073. /*.n_tasks =*/ n_tasks,
  4074. /*.userdata =*/ userdata
  4075. };
  4076. ggml_set_op_params(result, (const void *) &params, sizeof(params));
  4077. result->op = GGML_OP_MAP_CUSTOM3;
  4078. result->src[0] = a;
  4079. result->src[1] = b;
  4080. result->src[2] = c;
  4081. return result;
  4082. }
  4083. struct ggml_tensor * ggml_map_custom3(
  4084. struct ggml_context * ctx,
  4085. struct ggml_tensor * a,
  4086. struct ggml_tensor * b,
  4087. struct ggml_tensor * c,
  4088. const ggml_custom3_op_t fun,
  4089. int n_tasks,
  4090. void * userdata) {
  4091. return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
  4092. }
  4093. struct ggml_tensor * ggml_map_custom3_inplace(
  4094. struct ggml_context * ctx,
  4095. struct ggml_tensor * a,
  4096. struct ggml_tensor * b,
  4097. struct ggml_tensor * c,
  4098. const ggml_custom3_op_t fun,
  4099. int n_tasks,
  4100. void * userdata) {
  4101. return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
  4102. }
  4103. // ggml_cross_entropy_loss
  4104. struct ggml_tensor * ggml_cross_entropy_loss(
  4105. struct ggml_context * ctx,
  4106. struct ggml_tensor * a,
  4107. struct ggml_tensor * b) {
  4108. GGML_ASSERT(ggml_are_same_shape(a, b));
  4109. struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
  4110. result->op = GGML_OP_CROSS_ENTROPY_LOSS;
  4111. result->src[0] = a;
  4112. result->src[1] = b;
  4113. return result;
  4114. }
  4115. // ggml_cross_entropy_loss_back
  4116. struct ggml_tensor * ggml_cross_entropy_loss_back(
  4117. struct ggml_context * ctx,
  4118. struct ggml_tensor * a,
  4119. struct ggml_tensor * b,
  4120. struct ggml_tensor * c) {
  4121. GGML_ASSERT(ggml_are_same_shape(a, b));
  4122. GGML_ASSERT(ggml_is_scalar(c));
  4123. struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
  4124. result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
  4125. result->src[0] = a;
  4126. result->src[1] = b;
  4127. result->src[2] = c;
  4128. return result;
  4129. }
  4130. // opt_step_adamw
  4131. struct ggml_tensor * ggml_opt_step_adamw(
  4132. struct ggml_context * ctx,
  4133. struct ggml_tensor * a,
  4134. struct ggml_tensor * grad,
  4135. struct ggml_tensor * m,
  4136. struct ggml_tensor * v,
  4137. struct ggml_tensor * adamw_params) {
  4138. GGML_ASSERT(a->flags & GGML_TENSOR_FLAG_PARAM);
  4139. GGML_ASSERT(ggml_are_same_shape(a, grad));
  4140. GGML_ASSERT(ggml_are_same_shape(a, m));
  4141. GGML_ASSERT(ggml_are_same_shape(a, v));
  4142. GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
  4143. GGML_ASSERT(ggml_nelements(adamw_params) == 7);
  4144. struct ggml_tensor * result = ggml_view_tensor(ctx, a);
  4145. result->op = GGML_OP_OPT_STEP_ADAMW;
  4146. result->src[0] = a;
  4147. result->src[1] = grad;
  4148. result->src[2] = m;
  4149. result->src[3] = v;
  4150. result->src[4] = adamw_params;
  4151. return result;
  4152. }
  4153. ////////////////////////////////////////////////////////////////////////////////
  4154. struct ggml_hash_set ggml_hash_set_new(size_t size) {
  4155. size = ggml_hash_size(size);
  4156. struct ggml_hash_set result;
  4157. result.size = size;
  4158. result.keys = GGML_MALLOC(sizeof(struct ggml_tensor *) * size);
  4159. result.used = GGML_CALLOC(ggml_bitset_size(size), sizeof(ggml_bitset_t));
  4160. return result;
  4161. }
  4162. void ggml_hash_set_reset(struct ggml_hash_set * hash_set) {
  4163. memset(hash_set->used, 0, sizeof(ggml_bitset_t) * ggml_bitset_size(hash_set->size));
  4164. }
  4165. void ggml_hash_set_free(struct ggml_hash_set * hash_set) {
  4166. GGML_FREE(hash_set->used);
  4167. GGML_FREE(hash_set->keys);
  4168. }
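//
// editor's note (usage sketch, not in the original source): the insert/lookup helpers
// used further below (ggml_hash_insert, ggml_hash_contains, ggml_hash_find) are static
// inline functions from ggml-impl.h; `t` is assumed to be an existing tensor pointer.
//
#if 0 // illustrative only - not compiled
struct ggml_hash_set set = ggml_hash_set_new(16);   // size is rounded up to a prime by ggml_hash_size below
ggml_hash_insert(&set, t);                          // returns the slot index, or GGML_HASHSET_ALREADY_EXISTS
bool seen = ggml_hash_contains(&set, t);            // true after the insert above
ggml_hash_set_free(&set);
#endif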
  4169. size_t ggml_hash_size(size_t min_sz) {
  4170. // next primes after powers of two
  4171. static const size_t primes[] = {
  4172. 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,
  4173. 2053, 4099, 8209, 16411, 32771, 65537, 131101,
  4174. 262147, 524309, 1048583, 2097169, 4194319, 8388617,
  4175. 16777259, 33554467, 67108879, 134217757, 268435459,
  4176. 536870923, 1073741827, 2147483659
  4177. };
  4178. static const size_t n_primes = sizeof(primes)/sizeof(primes[0]);
4179. // find the smallest prime that is larger than or equal to min_sz
  4180. size_t l = 0;
  4181. size_t r = n_primes;
  4182. while (l < r) {
  4183. size_t m = (l + r)/2;
  4184. if (primes[m] < min_sz) {
  4185. l = m + 1;
  4186. } else {
  4187. r = m;
  4188. }
  4189. }
  4190. size_t sz = l < n_primes ? primes[l] : min_sz | 1;
  4191. return sz;
  4192. }
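// editor's note (worked example, not in the original source): ggml_hash_size(100)
// returns 131, the smallest prime in the table above that is >= 100; a request larger
// than the last table entry falls back to min_sz|1, i.e. min_sz rounded up to an odd number.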
  4193. struct hash_map {
  4194. struct ggml_hash_set set;
  4195. struct ggml_tensor ** vals;
  4196. };
  4197. static struct hash_map * ggml_new_hash_map(size_t size) {
  4198. struct hash_map * result = GGML_MALLOC(sizeof(struct hash_map));
  4199. result->set = ggml_hash_set_new(size);
  4200. result->vals = GGML_CALLOC(result->set.size, sizeof(struct ggml_tensor *));
  4201. return result;
  4202. }
  4203. static void ggml_hash_map_free(struct hash_map * map) {
  4204. ggml_hash_set_free(&map->set);
  4205. GGML_FREE(map->vals);
  4206. GGML_FREE(map);
  4207. }
  4208. // utility functions to change gradients
4209. // isrc is the index of tensor in cgraph->visited_hash_set.keys
4210. // the corresponding gradient (accumulator) is also stored at position isrc
4211. // if the tensor has a gradient accumulator, modify that accumulator in-place
4212. // else if there is no gradient for the tensor yet, set the corresponding value
4213. // else, just add/subtract/etc. the incoming gradient
  4214. static void ggml_add_or_set(
  4215. struct ggml_context * ctx,
  4216. struct ggml_cgraph * cgraph,
  4217. size_t isrc,
  4218. struct ggml_tensor * tensor) {
  4219. struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
  4220. GGML_ASSERT(src);
  4221. if (cgraph->grads[isrc]) {
  4222. cgraph->grads[isrc] = ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
  4223. } else {
  4224. cgraph->grads[isrc] = tensor;
  4225. }
  4226. ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
  4227. ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
  4228. }
  4229. static void ggml_acc_or_set(
  4230. struct ggml_context * ctx,
  4231. struct ggml_cgraph * cgraph,
  4232. size_t isrc,
  4233. struct ggml_tensor * tensor,
  4234. const size_t nb1,
  4235. const size_t nb2,
  4236. const size_t nb3,
  4237. const size_t offset) {
  4238. struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
  4239. GGML_ASSERT(src);
  4240. if (cgraph->grads[isrc]) {
  4241. cgraph->grads[isrc] = ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]);
  4242. } else {
4243. struct ggml_tensor * a_zero = ggml_scale(ctx, src, 0.0f); // FIXME: this is going to produce NaN if src contains inf/NaN
  4244. cgraph->grads[isrc] = ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
  4245. }
  4246. ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name);
  4247. ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
  4248. }
  4249. static void ggml_add1_or_set(
  4250. struct ggml_context * ctx,
  4251. struct ggml_cgraph * cgraph,
  4252. size_t isrc,
  4253. struct ggml_tensor * tensor) {
  4254. struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
  4255. GGML_ASSERT(src);
  4256. if (cgraph->grads[isrc]) {
  4257. cgraph->grads[isrc] = ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
  4258. } else {
  4259. cgraph->grads[isrc] = ggml_repeat(ctx, tensor, src);
  4260. }
  4261. ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
  4262. ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
  4263. }
  4264. static void ggml_sub_or_set(
  4265. struct ggml_context * ctx,
  4266. struct ggml_cgraph * cgraph,
  4267. size_t isrc,
  4268. struct ggml_tensor * tensor) {
  4269. struct ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
  4270. GGML_ASSERT(src);
  4271. if (cgraph->grads[isrc]) {
  4272. cgraph->grads[isrc] = ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
  4273. } else {
  4274. cgraph->grads[isrc] = ggml_neg(ctx, tensor);
  4275. }
  4276. ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
  4277. ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
  4278. }
  4279. static void ggml_compute_backward(
  4280. struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, bool * grads_needed) {
  4281. struct ggml_tensor * tensor = cgraph->nodes[i];
  4282. struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, tensor);
  4283. if (!grad) {
  4284. return;
  4285. }
  4286. struct ggml_tensor * src0 = tensor->src[0];
  4287. struct ggml_tensor * src1 = tensor->src[1];
  4288. struct ggml_tensor * src2 = tensor->src[2];
  4289. struct ggml_hash_set * hash_set = &cgraph->visited_hash_set;
  4290. const size_t isrc0 = src0 ? ggml_hash_find(hash_set, src0) : (size_t) -1;
  4291. const size_t isrc1 = src1 ? ggml_hash_find(hash_set, src1) : (size_t) -1;
  4292. const size_t isrc2 = src2 ? ggml_hash_find(hash_set, src2) : (size_t) -1;
  4293. const bool src0_needs_grads = src0 && isrc0 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
  4294. const bool src1_needs_grads = src1 && isrc1 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
  4295. const bool src2_needs_grads = src2 && isrc2 != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
  4296. switch (tensor->op) {
  4297. case GGML_OP_DUP: {
  4298. if (src0_needs_grads) {
  4299. ggml_add_or_set(ctx, cgraph, isrc0, grad);
  4300. }
  4301. } break;
  4302. case GGML_OP_ADD: {
  4303. if (src0_needs_grads) {
  4304. ggml_add_or_set(ctx, cgraph, isrc0, grad);
  4305. }
  4306. if (src1_needs_grads) {
  4307. struct ggml_tensor * tmp = grad;
  4308. if (!ggml_are_same_shape(src0, src1)) {
  4309. tmp = ggml_repeat_back(ctx, tmp, src1);
  4310. }
  4311. ggml_add_or_set(ctx, cgraph, isrc1, tmp);
  4312. }
  4313. } break;
  4314. case GGML_OP_ADD1: {
  4315. if (src0_needs_grads) {
  4316. ggml_add_or_set(ctx, cgraph, isrc0, grad);
  4317. }
  4318. if (src1_needs_grads) {
  4319. ggml_add_or_set(ctx, cgraph, isrc1, ggml_mean(ctx, grad)); // TODO: should probably be sum instead of mean
  4320. }
  4321. } break;
  4322. case GGML_OP_ACC: {
  4323. if (src0_needs_grads) {
  4324. ggml_add_or_set(ctx, cgraph, isrc0, grad);
  4325. }
  4326. if (src1_needs_grads) {
  4327. const size_t nb1 = ((int32_t *) tensor->op_params)[0];
  4328. const size_t nb2 = ((int32_t *) tensor->op_params)[1];
  4329. const size_t nb3 = ((int32_t *) tensor->op_params)[2];
  4330. const size_t offset = ((int32_t *) tensor->op_params)[3];
  4331. struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx,
  4332. grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
  4333. nb1, nb2, nb3, offset);
  4334. ggml_add_or_set(ctx, cgraph, isrc1, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1));
  4335. }
  4336. } break;
  4337. case GGML_OP_SUB: {
  4338. if (src0_needs_grads) {
  4339. ggml_add_or_set(ctx, cgraph, isrc0, grad);
  4340. }
  4341. if (src1_needs_grads) {
  4342. ggml_sub_or_set(ctx, cgraph, isrc1, grad);
  4343. }
  4344. } break;
  4345. case GGML_OP_MUL: {
  4346. if (src0_needs_grads) {
  4347. ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, src1, grad));
  4348. }
  4349. if (src1_needs_grads) {
  4350. struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad);
  4351. if (!ggml_are_same_shape(src0, src1)) {
  4352. tmp = ggml_repeat_back(ctx, tmp, src1);
  4353. }
  4354. ggml_add_or_set(ctx, cgraph, isrc1, tmp);
  4355. }
  4356. } break;
  4357. case GGML_OP_DIV: {
  4358. if (src0_needs_grads) {
  4359. ggml_add_or_set(ctx, cgraph, isrc0, ggml_div(ctx, grad, src1));
  4360. }
  4361. if (src1_needs_grads) {
  4362. ggml_sub_or_set(ctx, cgraph, isrc1, ggml_mul(ctx, grad, ggml_div(ctx, tensor, src1)));
  4363. }
  4364. } break;
  4365. case GGML_OP_SQR: {
  4366. if (src0_needs_grads) {
  4367. ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale(ctx, ggml_mul(ctx, src0, grad), 2.0f));
  4368. }
  4369. } break;
  4370. case GGML_OP_SQRT: {
  4371. if (src0_needs_grads) {
  4372. ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale(ctx, ggml_div(ctx, grad, tensor), 0.5f));
  4373. }
  4374. } break;
  4375. case GGML_OP_LOG: {
  4376. if (src0_needs_grads) {
  4377. ggml_add_or_set(ctx, cgraph, isrc0, ggml_div(ctx, grad, src0));
  4378. }
  4379. } break;
  4380. case GGML_OP_SIN: {
  4381. if (src0_needs_grads) {
  4382. ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_cos(ctx, src0)));
  4383. }
  4384. } break;
  4385. case GGML_OP_COS: {
  4386. if (src0_needs_grads) {
  4387. ggml_sub_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_sin(ctx, src0)));
  4388. }
  4389. } break;
  4390. case GGML_OP_SUM: {
  4391. if (src0_needs_grads) {
  4392. ggml_add1_or_set(ctx, cgraph, isrc0, grad);
  4393. }
  4394. } break;
  4395. case GGML_OP_SUM_ROWS: {
  4396. if (src0_needs_grads) {
  4397. ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat(ctx, grad, src0));
  4398. }
  4399. } break;
  4400. case GGML_OP_MEAN: {
  4401. if (src0_needs_grads) {
  4402. ggml_add1_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
  4403. }
  4404. } break;
  4405. case GGML_OP_REPEAT: {
  4406. if (src0_needs_grads) {
  4407. ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat_back(ctx, grad, src0));
  4408. }
  4409. } break;
  4410. case GGML_OP_REPEAT_BACK: {
  4411. if (src0_needs_grads) {
  4412. ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat(ctx, grad, src0));
  4413. }
  4414. } break;
  4415. case GGML_OP_RMS_NORM: {
  4416. if (src0_needs_grads) {
  4417. float eps;
  4418. memcpy(&eps, tensor->op_params, sizeof(float));
  4419. ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, src0, grad, eps));
  4420. }
  4421. } break;
  4422. case GGML_OP_MUL_MAT: {
  4423. // https://cs231n.github.io/optimization-2/#staged
  4424. // # forward pass
  4425. // s0 = np.random.randn(5, 10)
  4426. // s1 = np.random.randn(10, 3)
  4427. // t = s0.dot(s1)
  4428. // # now suppose we had the gradient on t from above in the circuit
  4429. // dt = np.random.randn(*t.shape) # same shape as t
  4430. // ds0 = dt.dot(s1.T) #.T gives the transpose of the matrix
4431. // ds1 = s0.T.dot(dt)
  4432. // tensor.shape [m,p,qq,rr]
  4433. // src0.shape [n,m,q1,r1]
  4434. // src1.shape [n,p,qq,rr]
  4435. if (src0_needs_grads) {
  4436. struct ggml_tensor * s1_tg =
  4437. ggml_out_prod(ctx, // [n,m,qq,rr]
  4438. src1, // [n,p,qq,rr]
  4439. grad); // [m,p,qq,rr]
  4440. const int64_t qq = s1_tg->ne[2];
  4441. const int64_t rr = s1_tg->ne[3];
  4442. const int64_t q1 = src0->ne[2];
  4443. const int64_t r1 = src0->ne[3];
  4444. const bool ne2_broadcasted = qq > q1;
  4445. const bool ne3_broadcasted = rr > r1;
  4446. if (ne2_broadcasted || ne3_broadcasted) {
  4447. // sum broadcast repetitions of s1_tg into shape of src0
  4448. s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
  4449. }
  4450. ggml_add_or_set(ctx, cgraph, isrc0, s1_tg /*= [n,m,q1,r1]*/);
  4451. }
  4452. if (src1_needs_grads) {
  4453. ggml_add_or_set(ctx, cgraph, isrc1,
  4454. // ggml_mul_mat(ctx, // [n,p,qq,rr]
  4455. // ggml_cont(ctx, // [m,n,q1,r1]
  4456. // ggml_transpose(ctx, src0)), // [m,n,q1,r1]
  4457. // grad), // [m,p,qq,rr]
4458. // when src0 is larger than tensor->grad (which is usually the case in llama),
4459. // avoid transposing src0 and instead transpose the smaller tensor->grad,
4460. // then use ggml_out_prod
  4461. ggml_out_prod(ctx, // [n,p,qq,rr]
  4462. src0, // [n,m,q1,r1]
  4463. ggml_transpose(ctx, // [p,m,qq,rr]
  4464. grad))); // [m,p,qq,rr]
  4465. }
  4466. } break;
  4467. case GGML_OP_SCALE: {
  4468. if (src0_needs_grads) {
  4469. float s;
  4470. memcpy(&s, tensor->op_params, sizeof(float));
  4471. ggml_add_or_set(ctx, cgraph, isrc0, ggml_scale_impl(ctx, grad, s, false));
  4472. }
  4473. } break;
  4474. case GGML_OP_SET: {
  4475. const size_t nb1 = ((const int32_t *) tensor->op_params)[0];
  4476. const size_t nb2 = ((const int32_t *) tensor->op_params)[1];
  4477. const size_t nb3 = ((const int32_t *) tensor->op_params)[2];
  4478. const size_t offset = ((const int32_t *) tensor->op_params)[3];
  4479. struct ggml_tensor * tensor_grad_view = NULL;
  4480. if (src0_needs_grads || src1_needs_grads) {
  4481. GGML_ASSERT(src0->type == tensor->type);
  4482. GGML_ASSERT(!cgraph->grads[isrc0] || cgraph->grads[isrc0]->type == grad->type);
  4483. GGML_ASSERT(!cgraph->grads[isrc1] || !src1_needs_grads || cgraph->grads[isrc1]->type == grad->type);
  4484. tensor_grad_view = ggml_view_4d(ctx,
  4485. grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
  4486. nb1, nb2, nb3, offset);
  4487. }
  4488. if (src0_needs_grads) {
  4489. struct ggml_tensor * tmp = ggml_neg(ctx, tensor_grad_view);
  4490. ggml_add_or_set(ctx, cgraph, isrc0, ggml_acc_impl(ctx, grad, tmp, nb1, nb2, nb3, offset, false));
  4491. }
  4492. if (src1_needs_grads) {
  4493. ggml_add_or_set(ctx, cgraph, isrc1, ggml_reshape(ctx, ggml_cont(ctx, tensor_grad_view), src1));
  4494. }
  4495. } break;
  4496. case GGML_OP_CPY: {
4497. // cpy overwrites the value of src1 with src0 and returns view(src1)
  4498. // the overwriting is mathematically equivalent to:
  4499. // tensor = src0 * 1 + src1 * 0
  4500. if (src0_needs_grads) {
  4501. // dsrc0 = dtensor * 1
  4502. ggml_add_or_set(ctx, cgraph, isrc0, grad);
  4503. }
  4504. if (src1_needs_grads) {
  4505. // dsrc1 = dtensor * 0 -> noop
  4506. }
  4507. } break;
  4508. case GGML_OP_CONT: {
  4509. // same as cpy
  4510. if (src0_needs_grads) {
  4511. GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
  4512. GGML_ASSERT(ggml_is_contiguous(grad));
  4513. ggml_add_or_set(ctx, cgraph, isrc0, grad);
  4514. }
  4515. } break;
  4516. case GGML_OP_RESHAPE: {
  4517. if (src0_needs_grads) {
  4518. struct ggml_tensor * grad_cont = ggml_is_contiguous(grad) ? grad : ggml_cont(ctx, grad);
  4519. ggml_add_or_set(ctx, cgraph, isrc0, ggml_reshape(ctx, grad_cont, src0));
  4520. }
  4521. } break;
  4522. case GGML_OP_VIEW: {
  4523. if (src0_needs_grads) {
  4524. size_t offset;
  4525. memcpy(&offset, tensor->op_params, sizeof(offset));
  4526. size_t nb1 = tensor->nb[1];
  4527. size_t nb2 = tensor->nb[2];
  4528. size_t nb3 = tensor->nb[3];
  4529. if (cgraph->grads[isrc0] && src0->type != cgraph->grads[isrc0]->type) {
4530. // the gradient is typically F32, but src0 could be of a different type
  4531. size_t ng = ggml_element_size(cgraph->grads[isrc0]);
  4532. size_t n0 = ggml_element_size(src0);
  4533. GGML_ASSERT(offset % n0 == 0);
  4534. GGML_ASSERT(nb1 % n0 == 0);
  4535. GGML_ASSERT(nb2 % n0 == 0);
  4536. GGML_ASSERT(nb3 % n0 == 0);
  4537. offset = (offset / n0) * ng;
  4538. nb1 = (nb1 / n0) * ng;
  4539. nb2 = (nb2 / n0) * ng;
  4540. nb3 = (nb3 / n0) * ng;
  4541. }
  4542. ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset);
  4543. }
  4544. } break;
  4545. case GGML_OP_PERMUTE: {
  4546. if (src0_needs_grads) {
  4547. const int32_t * axes = (const int32_t *) tensor->op_params;
  4548. const int axis0 = axes[0] & 0x3;
  4549. const int axis1 = axes[1] & 0x3;
  4550. const int axis2 = axes[2] & 0x3;
  4551. const int axis3 = axes[3] & 0x3;
  4552. int axb[4] = {0,0,0,0}; // axes backward
  4553. axb[axis0] = 0;
  4554. axb[axis1] = 1;
  4555. axb[axis2] = 2;
  4556. axb[axis3] = 3;
  4557. ggml_add_or_set(ctx, cgraph, isrc0, ggml_permute(ctx, grad, axb[0], axb[1], axb[2], axb[3]));
  4558. }
  4559. } break;
  4560. case GGML_OP_TRANSPOSE: {
  4561. if (src0_needs_grads) {
  4562. ggml_add_or_set(ctx, cgraph, isrc0, ggml_transpose(ctx, grad));
  4563. }
  4564. } break;
  4565. case GGML_OP_GET_ROWS: {
  4566. if (src0_needs_grads) {
  4567. ggml_add_or_set(ctx, cgraph, isrc0, ggml_get_rows_back(ctx, grad, src1, src0));
  4568. }
  4569. if (src1_needs_grads) {
  4570. // noop
  4571. }
  4572. } break;
  4573. case GGML_OP_DIAG_MASK_INF: {
  4574. if (src0_needs_grads) {
  4575. /* ggml_diag_mask_inf_impl() shouldn't be here */
  4576. /* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
  4577. const int n_past = ((const int32_t *) tensor->op_params)[0];
  4578. ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
  4579. }
  4580. } break;
  4581. case GGML_OP_DIAG_MASK_ZERO: {
  4582. if (src0_needs_grads) {
  4583. const int n_past = ((const int32_t *) tensor->op_params)[0];
  4584. ggml_add_or_set(ctx, cgraph, isrc0, ggml_diag_mask_zero_impl(ctx, grad, n_past, false));
  4585. }
  4586. } break;
  4587. case GGML_OP_SOFT_MAX: {
  4588. if (src0_needs_grads) {
  4589. ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_back(ctx, grad, tensor));
  4590. }
  4591. GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented");
  4592. } break;
  4593. case GGML_OP_ROPE: {
  4594. if (src0_needs_grads) {
  4595. //const int n_past = ((int32_t *) tensor->op_params)[0];
  4596. const int n_dims = ((const int32_t *) tensor->op_params)[1];
  4597. const int mode = ((const int32_t *) tensor->op_params)[2];
  4598. //const int n_ctx = ((int32_t *) tensor->op_params)[3];
  4599. const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4];
  4600. float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
  4601. memcpy(&freq_base, (const float *) tensor->op_params + 5, sizeof(float));
  4602. memcpy(&freq_scale, (const float *) tensor->op_params + 6, sizeof(float));
  4603. memcpy(&ext_factor, (const float *) tensor->op_params + 7, sizeof(float));
  4604. memcpy(&attn_factor, (const float *) tensor->op_params + 8, sizeof(float));
  4605. memcpy(&beta_fast, (const float *) tensor->op_params + 9, sizeof(float));
  4606. memcpy(&beta_slow, (const float *) tensor->op_params + 10, sizeof(float));
  4607. ggml_add_or_set(ctx, cgraph, isrc0,
  4608. ggml_rope_back(ctx, grad, src1, src2, n_dims, mode, n_ctx_orig, freq_base,
  4609. freq_scale, ext_factor, attn_factor, beta_fast, beta_slow));
  4610. }
  4611. GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
  4612. } break;
  4613. case GGML_OP_IM2COL: {
  4614. if (src1_needs_grads) {
  4615. const int32_t s0 = ggml_get_op_params_i32(tensor, 0);
  4616. const int32_t s1 = ggml_get_op_params_i32(tensor, 1);
  4617. const int32_t p0 = ggml_get_op_params_i32(tensor, 2);
  4618. const int32_t p1 = ggml_get_op_params_i32(tensor, 3);
  4619. const int32_t d0 = ggml_get_op_params_i32(tensor, 4);
  4620. const int32_t d1 = ggml_get_op_params_i32(tensor, 5);
  4621. const bool is_2D = ggml_get_op_params_i32(tensor, 6) == 1;
  4622. ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, src0, grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
  4623. }
  4624. } break;
  4625. case GGML_OP_POOL_2D: {
  4626. if (src0_needs_grads) {
  4627. const enum ggml_op_pool op = ggml_get_op_params_i32(tensor, 0);
  4628. const int32_t k0 = ggml_get_op_params_i32(tensor, 1);
  4629. const int32_t k1 = ggml_get_op_params_i32(tensor, 2);
  4630. const int32_t s0 = ggml_get_op_params_i32(tensor, 3);
  4631. const int32_t s1 = ggml_get_op_params_i32(tensor, 4);
  4632. const int32_t p0 = ggml_get_op_params_i32(tensor, 5);
  4633. const int32_t p1 = ggml_get_op_params_i32(tensor, 6);
  4634. ggml_add_or_set(ctx, cgraph, isrc0, ggml_pool_2d_back(ctx, grad, src0, op, k0, k1, s0, s1, p0, p1));
  4635. }
  4636. } break;
  4637. case GGML_OP_WIN_PART:
  4638. case GGML_OP_WIN_UNPART:
  4639. case GGML_OP_UNARY: {
  4640. switch (ggml_get_unary_op(tensor)) {
  4641. case GGML_UNARY_OP_ABS: {
  4642. if (src0_needs_grads) {
  4643. ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, ggml_sgn(ctx, src0), grad));
  4644. }
  4645. } break;
  4646. case GGML_UNARY_OP_SGN: {
  4647. // noop
  4648. } break;
  4649. case GGML_UNARY_OP_NEG: {
  4650. if (src0_needs_grads) {
  4651. ggml_sub_or_set(ctx, cgraph, isrc0, grad);
  4652. }
  4653. } break;
  4654. case GGML_UNARY_OP_STEP: {
  4655. // noop
  4656. } break;
  4657. case GGML_UNARY_OP_RELU: {
  4658. if (src0_needs_grads) {
  4659. ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, ggml_step(ctx, src0), grad));
  4660. }
  4661. } break;
  4662. case GGML_UNARY_OP_SILU: {
  4663. if (src0_needs_grads) {
  4664. ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, src0, grad));
  4665. }
  4666. } break;
  4667. case GGML_UNARY_OP_EXP: {
  4668. if (src0_needs_grads) {
  4669. ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, tensor, grad));
  4670. }
  4671. } break;
  4672. default: {
  4673. fprintf(stderr, "%s: unsupported unary op for backward pass: %s\n",
  4674. __func__, ggml_unary_op_name(ggml_get_unary_op(tensor)));
  4675. GGML_ABORT("fatal error");
  4676. } //break;
  4677. }
  4678. } break;
  4679. case GGML_OP_CROSS_ENTROPY_LOSS: {
  4680. if (src0_needs_grads) {
  4681. ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, src0, src1, grad));
  4682. }
  4683. GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
  4684. } break;
  4685. case GGML_OP_NONE: {
  4686. // noop
  4687. } break;
  4688. case GGML_OP_COUNT:
  4689. default: {
  4690. fprintf(stderr, "%s: unsupported ggml op for backward pass: %s\n", __func__, ggml_op_name(tensor->op));
  4691. GGML_ABORT("fatal error");
  4692. } //break;
  4693. }
  4694. GGML_ASSERT(!src0_needs_grads || ggml_are_same_shape(src0, cgraph->grads[isrc0]));
  4695. GGML_ASSERT(!src1_needs_grads || ggml_are_same_shape(src1, cgraph->grads[isrc1]));
  4696. GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
  4697. }
  4698. static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
  4699. // check if already visited
  4700. if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
  4701. return;
  4702. }
  4703. for (int i = 0; i < GGML_MAX_SRC; ++i) {
  4704. const int k =
  4705. (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
  4706. (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
4707. /* unknown order, just fall back to using i */ i;
  4708. if (node->src[k]) {
  4709. ggml_visit_parents(cgraph, node->src[k]);
  4710. }
  4711. }
  4712. if (node->op == GGML_OP_NONE && !(node->flags & GGML_TENSOR_FLAG_PARAM)) {
  4713. // reached a leaf node, not part of the gradient graph (e.g. a constant)
  4714. GGML_ASSERT(cgraph->n_leafs < cgraph->size);
  4715. if (strlen(node->name) == 0) {
  4716. ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
  4717. }
  4718. cgraph->leafs[cgraph->n_leafs] = node;
  4719. cgraph->n_leafs++;
  4720. } else {
  4721. GGML_ASSERT(cgraph->n_nodes < cgraph->size);
  4722. if (strlen(node->name) == 0) {
  4723. ggml_format_name(node, "node_%d", cgraph->n_nodes);
  4724. }
  4725. cgraph->nodes[cgraph->n_nodes] = node;
  4726. cgraph->n_nodes++;
  4727. }
  4728. }
  4729. static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
  4730. if (!expand) {
  4731. // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
  4732. ggml_graph_clear(cgraph);
  4733. }
  4734. const int n0 = cgraph->n_nodes;
  4735. ggml_visit_parents(cgraph, tensor);
  4736. const int n_new = cgraph->n_nodes - n0;
  4737. GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
  4738. if (n_new > 0) {
4739. // the last added node should always be the starting point
  4740. GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor);
  4741. }
  4742. }
  4743. void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
  4744. ggml_build_forward_impl(cgraph, tensor, true);
  4745. }
  4746. void ggml_build_backward_expand(
  4747. struct ggml_context * ctx_static,
  4748. struct ggml_context * ctx_compute,
  4749. struct ggml_cgraph * cgraph,
  4750. bool accumulate) {
  4751. GGML_ASSERT(cgraph->n_nodes > 0);
  4752. GGML_ASSERT(cgraph->grads);
  4753. GGML_ASSERT(cgraph->grad_accs);
  4754. const int n_nodes_f = cgraph->n_nodes;
  4755. memset(cgraph->grads, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
  4756. memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct ggml_tensor *));
  4757. bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool));
  4758. {
  4759. bool any_params = false;
  4760. bool any_loss = false;
  4761. for (int i = 0; i < n_nodes_f; ++i) {
  4762. struct ggml_tensor * node = cgraph->nodes[i];
  4763. any_params = any_params || (node->flags & GGML_TENSOR_FLAG_PARAM);
  4764. any_loss = any_loss || (node->flags & GGML_TENSOR_FLAG_LOSS);
  4765. }
  4766. GGML_ASSERT(any_params && "no trainable parameters found, did you forget to call ggml_set_param?");
  4767. GGML_ASSERT(any_loss && "no training loss found, did you forget to call ggml_set_loss?");
  4768. }
  4769. for (int i = 0; i < n_nodes_f; ++i) {
  4770. struct ggml_tensor * node = cgraph->nodes[i];
  4771. if (node->type == GGML_TYPE_I32) {
  4772. continue;
  4773. }
  4774. bool node_needs_grad = (node->flags & GGML_TENSOR_FLAG_PARAM) || (node->flags & GGML_TENSOR_FLAG_LOSS);
  4775. bool ignore_src[GGML_MAX_SRC] = {false};
  4776. switch (node->op) {
  4777. // gradients in node->src[0] for one reason or another have no effect on output gradients
  4778. case GGML_OP_IM2COL: // only used for its shape
  4779. case GGML_OP_IM2COL_BACK: // same as IM2COL
  4780. ignore_src[0] = true;
  4781. break;
  4782. case GGML_OP_UNARY: {
  4783. const enum ggml_unary_op uop = ggml_get_unary_op(node);
  4784. // SGN and STEP unary ops are piecewise constant
  4785. if (uop == GGML_UNARY_OP_SGN || uop == GGML_UNARY_OP_STEP) {
  4786. ignore_src[0] = true;
  4787. }
  4788. } break;
  4789. // gradients in node->src[1] for one reason or another have no effect on output gradients
  4790. case GGML_OP_CPY: // gradients in CPY target are irrelevant
  4791. case GGML_OP_GET_ROWS: // row indices not differentiable
  4792. case GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
  4793. case GGML_OP_ROPE: // positions not differentiable
  4794. ignore_src[1] = true;
  4795. break;
  4796. default:
  4797. break;
  4798. }
  4799. for (int j = 0; j < GGML_MAX_SRC; ++j) {
  4800. if (!node->src[j] || ignore_src[j] || !grads_needed[ggml_hash_find(&cgraph->visited_hash_set, node->src[j])]) {
  4801. continue;
  4802. }
  4803. GGML_ASSERT(node->src[j]->type == GGML_TYPE_F32 || node->src[j]->type == GGML_TYPE_F16);
  4804. node_needs_grad = true;
  4805. break;
  4806. }
  4807. if (!node_needs_grad) {
  4808. continue;
  4809. }
  4810. // inplace operations are currently not supported
  4811. GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
  4812. node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);
  4813. const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
  4814. GGML_ASSERT(igrad != GGML_HASHSET_FULL);
  4815. GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
  4816. if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
  4817. cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
  4818. cgraph->grads[igrad] = cgraph->grad_accs[igrad];
  4819. ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
  4820. }
  4821. grads_needed[igrad] = true;
  4822. }
  4823. for (int i = n_nodes_f - 1; i >= 0; --i) {
  4824. // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
  4825. // use allocator to automatically make inplace operations
  4826. ggml_compute_backward(ctx_compute, cgraph, i, grads_needed);
  4827. }
  4828. free(grads_needed);
  4829. }
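//
// editor's note: a minimal sketch (not part of the original ggml.c) of how the pieces
// above fit together when building a training graph. ctx_static and ctx_compute are
// assumed to be already-initialized contexts with data allocation enabled, and the
// tensor names and sizes are illustrative; computing the graph is left to a backend.
//
#if 0 // illustrative only - not compiled
struct ggml_tensor * w    = ggml_new_tensor_2d(ctx_compute, GGML_TYPE_F32, 8, 4);
struct ggml_tensor * x    = ggml_new_tensor_2d(ctx_compute, GGML_TYPE_F32, 8, 1);
ggml_set_param(ctx_compute, w);                          // mark w as trainable
struct ggml_tensor * y    = ggml_mul_mat(ctx_compute, w, x);
struct ggml_tensor * loss = ggml_sum(ctx_compute, y);    // scalar F32 loss
ggml_set_loss(loss);

struct ggml_cgraph * gb = ggml_new_graph_custom(ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
ggml_build_forward_expand(gb, loss);
ggml_build_backward_expand(ctx_static, ctx_compute, gb, /*accumulate =*/ false);

ggml_graph_reset(gb);                                    // seed d(loss)/d(loss) = 1, zero the rest
struct ggml_tensor * w_grad = ggml_graph_get_grad(gb, w);
#endif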
  4830. static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
  4831. void * ptr = *p;
  4832. ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
  4833. *p = (void *) ((char *) ptr + size);
  4834. return ptr;
  4835. }
  4836. static size_t ggml_graph_nbytes(size_t size, bool grads) {
  4837. size_t hash_size = ggml_hash_size(size * 2);
  4838. void * p = 0;
  4839. incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
  4840. incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
  4841. incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
  4842. incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
  4843. if (grads) {
  4844. incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
  4845. incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grad_accs
  4846. }
  4847. incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
  4848. size_t nbytes = (size_t) p;
  4849. return nbytes;
  4850. }
  4851. size_t ggml_graph_overhead_custom(size_t size, bool grads) {
  4852. return GGML_OBJECT_SIZE + GGML_PAD(ggml_graph_nbytes(size, grads), GGML_MEM_ALIGN);
  4853. }
  4854. size_t ggml_graph_overhead(void) {
  4855. return ggml_graph_overhead_custom(GGML_DEFAULT_GRAPH_SIZE, false);
  4856. }
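//
// editor's note: a minimal sketch (not part of the original ggml.c) of sizing a context
// so that it can hold a graph plus its tensor metadata; the tensor count and data budget
// below are illustrative assumptions.
//
#if 0 // illustrative only - not compiled
struct ggml_init_params params = {
    /*.mem_size   =*/ ggml_graph_overhead() + 1024*ggml_tensor_overhead() + 16*1024*1024,
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ false,
};
struct ggml_context * ctx = ggml_init(params);
struct ggml_cgraph  * gf  = ggml_new_graph(ctx);   // GGML_DEFAULT_GRAPH_SIZE, no gradients
// ... create tensors and build the graph ...
ggml_free(ctx);
#endif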
  4857. struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) {
  4858. const size_t obj_size = ggml_graph_nbytes(size, grads);
  4859. struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size);
  4860. struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
  4861. // the size of the hash table is doubled since it needs to hold both nodes and leafs
  4862. size_t hash_size = ggml_hash_size(size * 2);
  4863. void * p = cgraph + 1;
  4864. struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
  4865. struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
  4866. struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
  4867. struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
  4868. struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
  4869. ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
  4870. // check that we allocated the correct amount of memory
  4871. assert(obj_size == (size_t)((char *)p - (char *)cgraph));
  4872. *cgraph = (struct ggml_cgraph) {
  4873. /*.size =*/ size,
  4874. /*.n_nodes =*/ 0,
  4875. /*.n_leafs =*/ 0,
  4876. /*.nodes =*/ nodes_ptr,
  4877. /*.grads =*/ grads_ptr,
  4878. /*.grad_accs =*/ grad_accs_ptr,
  4879. /*.leafs =*/ leafs_ptr,
  4880. /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
  4881. /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
  4882. };
  4883. ggml_hash_set_reset(&cgraph->visited_hash_set);
  4884. if (grads) {
  4885. memset(cgraph->grads, 0, hash_size*sizeof(struct ggml_tensor *));
  4886. memset(cgraph->grad_accs, 0, hash_size*sizeof(struct ggml_tensor *));
  4887. }
  4888. return cgraph;
  4889. }
  4890. struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
  4891. return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
  4892. }
  4893. struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
  4894. struct ggml_cgraph cgraph = {
  4895. /*.size =*/ 0,
  4896. /*.n_nodes =*/ i1 - i0,
  4897. /*.n_leafs =*/ 0,
  4898. /*.nodes =*/ cgraph0->nodes + i0,
  4899. /*.grads =*/ NULL, // gradients would need visited_hash_set
  4900. /*.grad_accs =*/ NULL,
  4901. /*.leafs =*/ NULL,
  4902. /*.visited_hash_set =*/ { 0, NULL, NULL },
  4903. /*.order =*/ cgraph0->order,
  4904. };
  4905. return cgraph;
  4906. }
  4907. void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
  4908. GGML_ASSERT(dst->size >= src->n_leafs);
  4909. GGML_ASSERT(dst->size >= src->n_nodes);
  4910. GGML_ASSERT(dst->visited_hash_set.size >= src->visited_hash_set.size);
  4911. dst->n_leafs = src->n_leafs;
  4912. dst->n_nodes = src->n_nodes;
  4913. dst->order = src->order;
  4914. for (int i = 0; i < src->n_leafs; ++i) {
  4915. dst->leafs[i] = src->leafs[i];
  4916. }
  4917. for (int i = 0; i < src->n_nodes; ++i) {
  4918. dst->nodes[i] = src->nodes[i];
  4919. }
  4920. for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
  4921. // copy all hashset keys (tensors) that are in use
  4922. if (ggml_bitset_get(src->visited_hash_set.used, i)) {
  4923. ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
  4924. }
  4925. }
  4926. if (dst->grads) {
  4927. memset(dst->grads, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
  4928. memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct ggml_tensor *));
  4929. }
  4930. if (src->grads) {
  4931. GGML_ASSERT(dst->grads != NULL);
  4932. GGML_ASSERT(dst->grad_accs != NULL);
  4933. for (int i = 0; i < src->n_nodes; ++i) {
  4934. const size_t igrad_src = ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
  4935. const size_t igrad_dst = ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
  4936. GGML_ASSERT(igrad_src != GGML_HASHSET_FULL);
  4937. GGML_ASSERT(ggml_bitset_get(src->visited_hash_set.used, igrad_src));
  4938. GGML_ASSERT(igrad_dst != GGML_HASHSET_FULL);
  4939. GGML_ASSERT(ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
  4940. dst->grads[igrad_dst] = src->grads[igrad_src];
  4941. dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
  4942. }
  4943. }
  4944. }
  4945. struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
  4946. struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
  4947. ggml_graph_cpy(cgraph, result);
  4948. return result;
  4949. }
  4950. struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
  4951. if (ggml_is_empty(tensor)) {
  4952. return tensor;
  4953. }
  4954. if (tensor->buffer) {
  4955. ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor));
  4956. } else {
  4957. GGML_ASSERT(tensor->data);
  4958. memset(tensor->data, 0, ggml_nbytes(tensor));
  4959. }
  4960. return tensor;
  4961. }
  4962. void ggml_graph_reset(struct ggml_cgraph * cgraph) {
  4963. GGML_ASSERT(cgraph->grads != NULL);
  4964. for (int i = 0; i < cgraph->n_nodes; i++) {
  4965. struct ggml_tensor * node = cgraph->nodes[i];
  4966. struct ggml_tensor * grad_acc = ggml_graph_get_grad_acc(cgraph, node);
  4967. if (node->op == GGML_OP_OPT_STEP_ADAMW) {
  4968. // clear momenta
  4969. ggml_set_zero(node->src[2]);
  4970. ggml_set_zero(node->src[3]);
  4971. }
4972. // the initial gradient of the loss should be 1, all other gradients 0
  4973. if (grad_acc) {
  4974. if (node->flags & GGML_TENSOR_FLAG_LOSS) {
  4975. GGML_ASSERT(grad_acc->type == GGML_TYPE_F32);
  4976. GGML_ASSERT(ggml_is_scalar(grad_acc));
  4977. const float onef = 1.0f;
  4978. if (grad_acc->buffer) {
  4979. ggml_backend_tensor_set(grad_acc, &onef, 0, sizeof(float));
  4980. } else {
  4981. GGML_ASSERT(grad_acc->data);
  4982. *((float *) grad_acc->data) = onef;
  4983. }
  4984. } else {
  4985. ggml_set_zero(grad_acc);
  4986. }
  4987. }
  4988. }
  4989. }
  4990. void ggml_graph_clear(struct ggml_cgraph * cgraph) {
  4991. cgraph->n_leafs = 0;
  4992. cgraph->n_nodes = 0;
  4993. ggml_hash_set_reset(&cgraph->visited_hash_set);
  4994. }
  4995. int ggml_graph_size(struct ggml_cgraph * cgraph) {
  4996. return cgraph->size;
  4997. }
  4998. struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
  4999. if (i < 0) {
  5000. GGML_ASSERT(cgraph->n_nodes + i >= 0);
  5001. return cgraph->nodes[cgraph->n_nodes + i];
  5002. }
  5003. GGML_ASSERT(i < cgraph->n_nodes);
  5004. return cgraph->nodes[i];
  5005. }
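// editor's note (not in the original source): negative indices address nodes from the
// end, e.g. ggml_graph_node(gf, -1) returns the last node added to the graph.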
  5006. struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
  5007. return cgraph->nodes;
  5008. }
  5009. int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
  5010. return cgraph->n_nodes;
  5011. }
  5012. void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
  5013. GGML_ASSERT(cgraph->size > cgraph->n_nodes);
  5014. cgraph->nodes[cgraph->n_nodes] = tensor;
  5015. cgraph->n_nodes++;
  5016. }
  5017. struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, const char * name) {
  5018. for (int i = 0; i < cgraph->n_leafs; i++) {
  5019. struct ggml_tensor * leaf = cgraph->leafs[i];
  5020. if (strcmp(leaf->name, name) == 0) {
  5021. return leaf;
  5022. }
  5023. }
  5024. for (int i = 0; i < cgraph->n_nodes; i++) {
  5025. struct ggml_tensor * node = cgraph->nodes[i];
  5026. if (strcmp(node->name, name) == 0) {
  5027. return node;
  5028. }
  5029. }
  5030. return NULL;
  5031. }
  5032. struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
  5033. const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
  5034. return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
  5035. }
  5036. struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
  5037. const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
  5038. return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
  5039. }
  5040. void ggml_graph_print(const struct ggml_cgraph * cgraph) {
  5041. GGML_LOG_INFO("=== GRAPH ===\n");
  5042. GGML_LOG_INFO("n_nodes = %d\n", cgraph->n_nodes);
  5043. for (int i = 0; i < cgraph->n_nodes; i++) {
  5044. struct ggml_tensor * node = cgraph->nodes[i];
  5045. GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
  5046. i,
  5047. node->ne[0], node->ne[1], node->ne[2],
  5048. ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" :
  5049. ggml_graph_get_grad(cgraph, node) ? "g" : " ");
  5050. }
  5051. GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs);
  5052. for (int i = 0; i < cgraph->n_leafs; i++) {
  5053. struct ggml_tensor * node = cgraph->leafs[i];
  5054. GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
  5055. i,
  5056. node->ne[0], node->ne[1],
  5057. ggml_op_name(node->op),
  5058. ggml_get_name(node));
  5059. }
  5060. GGML_LOG_INFO("========================================\n");
  5061. }
  5062. // check if node is part of the graph
  5063. static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
  5064. if (cgraph == NULL) {
  5065. return true;
  5066. }
  5067. for (int i = 0; i < cgraph->n_nodes; i++) {
  5068. if (cgraph->nodes[i] == node) {
  5069. return true;
  5070. }
  5071. }
  5072. return false;
  5073. }
  5074. static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
  5075. for (int i = 0; i < cgraph->n_nodes; i++) {
  5076. struct ggml_tensor * parent = cgraph->nodes[i];
  5077. struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, parent);
  5078. if (grad == node) {
  5079. return parent;
  5080. }
  5081. }
  5082. return NULL;
  5083. }
  5084. static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
  5085. struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
  5086. struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
  5087. fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
  5088. gparent0 ? (void *) gparent0 : (void *) parent,
  5089. gparent0 ? "g" : "x",
  5090. gparent ? (void *) gparent : (void *) node,
  5091. gparent ? "g" : "x",
  5092. gparent ? "empty" : "vee",
  5093. gparent ? "dashed" : "solid",
  5094. label);
  5095. }
  5096. static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
  5097. fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
  5098. (void *) parent, "x",
  5099. (void *) node, "x",
  5100. label);
  5101. }
  5102. void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
  5103. char color[16];
  5104. FILE * fp = ggml_fopen(filename, "w");
  5105. GGML_ASSERT(fp);
  5106. fprintf(fp, "digraph G {\n");
  5107. fprintf(fp, " newrank = true;\n");
  5108. fprintf(fp, " rankdir = TB;\n");
  5109. for (int i = 0; i < gb->n_nodes; i++) {
  5110. struct ggml_tensor * node = gb->nodes[i];
  5111. struct ggml_tensor * grad = ggml_graph_get_grad(gb, node);
  5112. if (ggml_graph_get_parent(gb, node) != NULL) {
  5113. continue;
  5114. }
  5115. if (node->flags & GGML_TENSOR_FLAG_PARAM) {
  5116. snprintf(color, sizeof(color), "yellow");
  5117. } else if (grad) {
  5118. if (ggml_graph_find(gf, node)) {
  5119. snprintf(color, sizeof(color), "green");
  5120. } else {
  5121. snprintf(color, sizeof(color), "lightblue");
  5122. }
  5123. } else {
  5124. snprintf(color, sizeof(color), "white");
  5125. }
  5126. fprintf(fp, " \"%p\" [ "
  5127. "style = filled; fillcolor = %s; shape = record; "
  5128. "label=\"",
  5129. (void *) node, color);
  5130. if (strlen(node->name) > 0) {
  5131. fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
  5132. } else {
  5133. fprintf(fp, "(%s)|", ggml_type_name(node->type));
  5134. }
  5135. if (ggml_is_matrix(node)) {
  5136. fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], ggml_op_symbol(node->op));
  5137. } else {
  5138. fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], ggml_op_symbol(node->op));
  5139. }
  5140. if (grad) {
  5141. fprintf(fp, " | <g>%s\"; ]\n", ggml_op_symbol(grad->op));
  5142. } else {
  5143. fprintf(fp, "\"; ]\n");
  5144. }
  5145. }
  5146. for (int i = 0; i < gb->n_leafs; i++) {
  5147. struct ggml_tensor * node = gb->leafs[i];
  5148. snprintf(color, sizeof(color), "pink");
  5149. fprintf(fp, " \"%p\" [ "
  5150. "style = filled; fillcolor = %s; shape = record; "
  5151. "label=\"<x>",
  5152. (void *) node, color);
  5153. if (strlen(node->name) > 0) {
  5154. fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
  5155. } else {
  5156. fprintf(fp, "(%s)|", ggml_type_name(node->type));
  5157. }
  5158. fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
  5159. if (ggml_nelements(node) < 5 && node->data != NULL) {
  5160. fprintf(fp, " | (");
  5161. for (int j = 0; j < ggml_nelements(node); j++) {
  5162. // FIXME: use ggml-backend to obtain the tensor data
  5163. //if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
  5164. // fprintf(fp, "%d", ggml_get_i32_1d(node, j));
  5165. //}
  5166. //else if (node->type == GGML_TYPE_F32 ||
  5167. // node->type == GGML_TYPE_F16 ||
  5168. // node->type == GGML_TYPE_BF16) {
  5169. // fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
  5170. //}
  5171. //else
  5172. {
  5173. fprintf(fp, "#");
  5174. }
  5175. if (j < ggml_nelements(node) - 1) {
  5176. fprintf(fp, ", ");
  5177. }
  5178. }
  5179. fprintf(fp, ")");
  5180. }
  5181. fprintf(fp, "\"; ]\n");
  5182. }
  5183. for (int i = 0; i < gb->n_nodes; i++) {
  5184. struct ggml_tensor * node = gb->nodes[i];
  5185. for (int j = 0; j < GGML_MAX_SRC; j++) {
  5186. if (node->src[j]) {
  5187. char label[16];
  5188. snprintf(label, sizeof(label), "src %d", j);
  5189. ggml_graph_dump_dot_node_edge(fp, gb, node, node->src[j], label);
  5190. }
  5191. }
  5192. }
  5193. for (int i = 0; i < gb->n_leafs; i++) {
  5194. struct ggml_tensor * node = gb->leafs[i];
  5195. for (int j = 0; j < GGML_MAX_SRC; j++) {
  5196. if (node->src[j]) {
  5197. char label[16];
  5198. snprintf(label, sizeof(label), "src %d", j);
  5199. ggml_graph_dump_dot_leaf_edge(fp, node, node->src[j], label);
  5200. }
  5201. }
  5202. }
  5203. fprintf(fp, "}\n");
  5204. fclose(fp);
  5205. GGML_LOG_INFO("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
  5206. }
  5207. ////////////////////////////////////////////////////////////////////////////////
  5208. void ggml_set_input(struct ggml_tensor * tensor) {
  5209. tensor->flags |= GGML_TENSOR_FLAG_INPUT;
  5210. }
  5211. void ggml_set_output(struct ggml_tensor * tensor) {
  5212. tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
  5213. }
  5214. void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
  5215. GGML_UNUSED(ctx); // TODO: remove this parameter
  5216. tensor->flags |= GGML_TENSOR_FLAG_PARAM;
  5217. }
  5218. void ggml_set_loss(struct ggml_tensor * tensor) {
  5219. GGML_ASSERT(ggml_is_scalar(tensor));
  5220. GGML_ASSERT(tensor->type == GGML_TYPE_F32);
  5221. tensor->flags |= GGML_TENSOR_FLAG_LOSS;
  5222. }
  5223. ////////////////////////////////////////////////////////////////////////////////
  5224. void ggml_quantize_init(enum ggml_type type) {
  5225. ggml_critical_section_start();
  5226. switch (type) {
  5227. case GGML_TYPE_IQ2_XXS:
  5228. case GGML_TYPE_IQ2_XS:
  5229. case GGML_TYPE_IQ2_S:
  5230. case GGML_TYPE_IQ1_S:
  5231. case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break;
  5232. case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break;
  5233. case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break;
  5234. default: // nothing
  5235. break;
  5236. }
  5237. ggml_critical_section_end();
  5238. }
  5239. void ggml_quantize_free(void) {
  5240. ggml_critical_section_start();
  5241. iq2xs_free_impl(GGML_TYPE_IQ2_XXS);
  5242. iq2xs_free_impl(GGML_TYPE_IQ2_XS);
  5243. iq2xs_free_impl(GGML_TYPE_IQ1_S);
  5244. iq3xs_free_impl(256);
  5245. ggml_critical_section_end();
  5246. }
  5247. bool ggml_quantize_requires_imatrix(enum ggml_type type) {
  5248. return
  5249. type == GGML_TYPE_IQ2_XXS ||
  5250. type == GGML_TYPE_IQ2_XS ||
  5251. type == GGML_TYPE_IQ1_S;// ||
  5252. //type == GGML_TYPE_IQ1_M;
  5253. }
  5254. size_t ggml_quantize_chunk(
  5255. enum ggml_type type,
  5256. const float * src,
  5257. void * dst,
  5258. int64_t start,
  5259. int64_t nrows,
  5260. int64_t n_per_row,
  5261. const float * imatrix) {
  5262. const int64_t n = (int64_t) nrows * n_per_row;
  5263. if (ggml_quantize_requires_imatrix(type)) {
  5264. GGML_ASSERT(imatrix != NULL);
  5265. }
  5266. GGML_ASSERT(start % type_traits[type].blck_size == 0);
  5267. GGML_ASSERT(start % n_per_row == 0);
  5268. ggml_quantize_init(type); // this is noop if already initialized
  5269. const size_t start_row = start / n_per_row;
  5270. const size_t row_size = ggml_row_size(type, n_per_row);
  5271. size_t result = 0;
  5272. switch (type) {
  5273. case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5274. case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5275. case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5276. case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5277. case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5278. case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5279. case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5280. case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5281. case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5282. case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5283. case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5284. case GGML_TYPE_TQ2_0: result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5285. case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5286. case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5287. case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5288. case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5289. case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5290. case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5291. case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5292. case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5293. case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
  5294. case GGML_TYPE_F16:
  5295. {
  5296. size_t elemsize = sizeof(ggml_fp16_t);
  5297. ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
  5298. result = n * elemsize;
  5299. } break;
  5300. case GGML_TYPE_BF16:
  5301. {
  5302. size_t elemsize = sizeof(ggml_bf16_t);
  5303. ggml_fp32_to_bf16_row_ref(src + start, (ggml_bf16_t *)dst + start, n);
  5304. result = n * elemsize;
  5305. } break;
  5306. case GGML_TYPE_F32:
  5307. {
  5308. size_t elemsize = sizeof(float);
  5309. result = n * elemsize;
  5310. memcpy((uint8_t *)dst + start * elemsize, src + start, result);
  5311. } break;
  5312. default:
  5313. assert(false);
  5314. }
  5315. GGML_ASSERT(result == nrows * row_size);
  5316. return result;
  5317. }
  5318. ////////////////////////////////////////////////////////////////////////////////
  5319. struct gguf_str {
  5320. uint64_t n; // GGUFv2
  5321. char * data;
  5322. };
  5323. static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
  5324. [GGUF_TYPE_UINT8] = sizeof(uint8_t),
  5325. [GGUF_TYPE_INT8] = sizeof(int8_t),
  5326. [GGUF_TYPE_UINT16] = sizeof(uint16_t),
  5327. [GGUF_TYPE_INT16] = sizeof(int16_t),
  5328. [GGUF_TYPE_UINT32] = sizeof(uint32_t),
  5329. [GGUF_TYPE_INT32] = sizeof(int32_t),
  5330. [GGUF_TYPE_FLOAT32] = sizeof(float),
  5331. [GGUF_TYPE_BOOL] = sizeof(bool),
  5332. [GGUF_TYPE_STRING] = sizeof(struct gguf_str),
  5333. [GGUF_TYPE_UINT64] = sizeof(uint64_t),
  5334. [GGUF_TYPE_INT64] = sizeof(int64_t),
  5335. [GGUF_TYPE_FLOAT64] = sizeof(double),
  5336. [GGUF_TYPE_ARRAY] = 0, // undefined
  5337. };
  5338. static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
  5339. static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
  5340. [GGUF_TYPE_UINT8] = "u8",
  5341. [GGUF_TYPE_INT8] = "i8",
  5342. [GGUF_TYPE_UINT16] = "u16",
  5343. [GGUF_TYPE_INT16] = "i16",
  5344. [GGUF_TYPE_UINT32] = "u32",
  5345. [GGUF_TYPE_INT32] = "i32",
  5346. [GGUF_TYPE_FLOAT32] = "f32",
  5347. [GGUF_TYPE_BOOL] = "bool",
  5348. [GGUF_TYPE_STRING] = "str",
  5349. [GGUF_TYPE_ARRAY] = "arr",
  5350. [GGUF_TYPE_UINT64] = "u64",
  5351. [GGUF_TYPE_INT64] = "i64",
  5352. [GGUF_TYPE_FLOAT64] = "f64",
  5353. };
  5354. static_assert(GGUF_TYPE_COUNT == 13, "GGUF_TYPE_COUNT != 13");
  5355. union gguf_value {
  5356. uint8_t uint8;
  5357. int8_t int8;
  5358. uint16_t uint16;
  5359. int16_t int16;
  5360. uint32_t uint32;
  5361. int32_t int32;
  5362. float float32;
  5363. uint64_t uint64;
  5364. int64_t int64;
  5365. double float64;
  5366. bool bool_;
  5367. struct gguf_str str;
  5368. struct {
  5369. enum gguf_type type;
  5370. uint64_t n; // GGUFv2
  5371. void * data;
  5372. } arr;
  5373. };
  5374. struct gguf_kv {
  5375. struct gguf_str key;
  5376. enum gguf_type type;
  5377. union gguf_value value;
  5378. };
  5379. struct gguf_header {
  5380. char magic[4];
  5381. uint32_t version;
  5382. uint64_t n_tensors; // GGUFv2
  5383. uint64_t n_kv; // GGUFv2
  5384. };
  5385. struct gguf_tensor_info {
  5386. struct gguf_str name;
  5387. uint32_t n_dims;
  5388. uint64_t ne[GGML_MAX_DIMS];
  5389. enum ggml_type type;
  5390. uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
  5391. // for writing API
  5392. const void * data;
  5393. size_t size;
  5394. };
  5395. struct gguf_context {
  5396. struct gguf_header header;
  5397. struct gguf_kv * kv;
  5398. struct gguf_tensor_info * infos;
  5399. size_t alignment;
  5400. size_t offset; // offset of `data` from beginning of file
  5401. size_t size; // size of `data` in bytes
  5402. //uint8_t * padding;
  5403. void * data;
  5404. };
  5405. static size_t gguf_type_size(enum gguf_type type) {
  5406. GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT);
  5407. return GGUF_TYPE_SIZE[type];
  5408. }
  5409. static bool gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
  5410. if (info->n_dims > GGML_MAX_DIMS) {
  5411. fprintf(stderr, "%s: invalid number of dimensions (%" PRIu32 ")\n", __func__, info->n_dims);
  5412. return false;
  5413. }
  5414. if (info->type < 0 || info->type >= GGML_TYPE_COUNT) {
  5415. fprintf(stderr, "%s: invalid type (%d)\n", __func__, info->type);
  5416. return false;
  5417. }
  5418. if (strlen(info->name.data) >= GGML_MAX_NAME) {
  5419. fprintf(stderr, "%s: tensor '%s' name is too long\n", __func__, info->name.data);
  5420. return false;
  5421. }
  5422. for (uint32_t i = 0; i < info->n_dims; ++i) {
  5423. if (info->ne[i] <= 0) {
  5424. fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[i]);
  5425. return false;
  5426. }
  5427. }
  5428. // prevent overflow for total number of elements
  5429. if (INT64_MAX/info->ne[1] <= info->ne[0]) {
  5430. fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[1]);
  5431. return false;
  5432. }
  5433. if (INT64_MAX/info->ne[2] <= info->ne[0]*info->ne[1]) {
  5434. fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[2]);
  5435. return false;
  5436. }
  5437. if (INT64_MAX/info->ne[3] <= info->ne[0]*info->ne[1]*info->ne[2]) {
  5438. fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[3]);
  5439. return false;
  5440. }
  5441. return true;
  5442. }
  5443. static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
  5444. const size_t n = fread(dst, 1, size, file);
  5445. *offset += n;
  5446. return n == size;
  5447. }
  5448. static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
  5449. p->n = 0;
  5450. p->data = NULL;
  5451. bool ok = true;
  5452. ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
  5453. // early exit if string length is invalid, prevents from integer overflow
  5454. if (p->n == SIZE_MAX) {
  5455. fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
  5456. return false;
  5457. }
  5458. p->data = calloc(p->n + 1, 1);
  5459. if (!p->data) {
  5460. fprintf(stderr, "%s: failed to allocate memory for string of length %" PRIu64 "\n", __func__, p->n);
  5461. return false;
  5462. }
  5463. ok = ok && gguf_fread_el(file, p->data, p->n, offset);
  5464. return ok;
  5465. }
  5466. static void gguf_free_kv(struct gguf_kv * kv) {
  5467. if (kv->key.data) {
  5468. GGML_FREE(kv->key.data);
  5469. }
  5470. if (kv->type == GGUF_TYPE_STRING) {
  5471. if (kv->value.str.data) {
  5472. GGML_FREE(kv->value.str.data);
  5473. }
  5474. }
  5475. if (kv->type == GGUF_TYPE_ARRAY) {
  5476. if (kv->value.arr.data) {
  5477. if (kv->value.arr.type == GGUF_TYPE_STRING) {
  5478. for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
  5479. struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
  5480. if (str->data) {
  5481. GGML_FREE(str->data);
  5482. }
  5483. }
  5484. }
  5485. GGML_FREE(kv->value.arr.data);
  5486. }
  5487. }
  5488. }
  5489. struct gguf_context * gguf_init_empty(void) {
  5490. struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
  5491. if (!ctx) {
  5492. fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
  5493. return NULL;
  5494. }
  5495. memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
  5496. ctx->header.version = GGUF_VERSION;
  5497. ctx->header.n_tensors = 0;
  5498. ctx->header.n_kv = 0;
  5499. ctx->kv = NULL;
  5500. ctx->infos = NULL;
  5501. ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
  5502. ctx->offset = 0;
  5503. ctx->size = 0;
  5504. ctx->data = NULL;
  5505. return ctx;
  5506. }
  5507. struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
  5508. FILE * file = ggml_fopen(fname, "rb");
  5509. if (!file) {
  5510. fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
  5511. return NULL;
  5512. }
  5513. // offset from start of file
  5514. size_t offset = 0;
  5515. char magic[4];
  5516. // check the magic before making allocations
  5517. {
  5518. gguf_fread_el(file, &magic, sizeof(magic), &offset);
  5519. for (uint32_t i = 0; i < sizeof(magic); i++) {
  5520. if (magic[i] != GGUF_MAGIC[i]) {
  5521. fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
  5522. fclose(file);
  5523. return NULL;
  5524. }
  5525. }
  5526. }
  5527. bool ok = true;
  5528. struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
  5529. if (!ctx) {
  5530. fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
  5531. fclose(file);
  5532. return NULL;
  5533. }
  5534. // read the header
  5535. {
  5536. strncpy(ctx->header.magic, magic, 4);
  5537. ctx->kv = NULL;
  5538. ctx->infos = NULL;
  5539. ctx->data = NULL;
  5540. ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset);
  5541. ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
  5542. ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset);
  5543. if (ctx->header.version == 1) {
  5544. fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
  5545. fclose(file);
  5546. gguf_free(ctx);
  5547. return NULL;
  5548. }
  5549. // sanity-checks to prevent from integer/buffer overflows
  5550. ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
  5551. ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
  5552. ok = ok && (ctx->header.n_kv < (SIZE_MAX/2)/sizeof(struct gguf_kv));
  5553. if (!ok) {
  5554. fprintf(stderr, "%s: failed to read header\n", __func__);
  5555. fclose(file);
  5556. gguf_free(ctx);
  5557. return NULL;
  5558. }
  5559. }
  5560. // read the kv pairs
  5561. {
  5562. const uint64_t n_kv = ctx->header.n_kv;
  5563. ctx->kv = calloc(n_kv, sizeof(struct gguf_kv));
  5564. if (!ctx->kv) {
  5565. fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
  5566. fclose(file);
  5567. gguf_free(ctx);
  5568. return NULL;
  5569. }
  5570. for (uint64_t i = 0; i < n_kv; ++i) {
  5571. struct gguf_kv * kv = &ctx->kv[i];
  5572. //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
  5573. ok = ok && gguf_fread_str(file, &kv->key, &offset);
  5574. ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
  5575. //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
  5576. switch (kv->type) {
  5577. case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break;
  5578. case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break;
  5579. case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break;
  5580. case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break;
  5581. case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break;
  5582. case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break;
  5583. case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
  5584. case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break;
  5585. case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break;
  5586. case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
  5587. case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break;
  5588. case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break;
  5589. case GGUF_TYPE_ARRAY:
  5590. {
  5591. ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
  5592. ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
  5593. switch (kv->value.arr.type) {
  5594. case GGUF_TYPE_UINT8:
  5595. case GGUF_TYPE_INT8:
  5596. case GGUF_TYPE_UINT16:
  5597. case GGUF_TYPE_INT16:
  5598. case GGUF_TYPE_UINT32:
  5599. case GGUF_TYPE_INT32:
  5600. case GGUF_TYPE_FLOAT32:
  5601. case GGUF_TYPE_UINT64:
  5602. case GGUF_TYPE_INT64:
  5603. case GGUF_TYPE_FLOAT64:
  5604. case GGUF_TYPE_BOOL:
  5605. {
  5606. // prevent from integer overflow in the malloc below
  5607. if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
  5608. fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
  5609. fclose(file);
  5610. gguf_free(ctx);
  5611. return NULL;
  5612. }
  5613. kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
  5614. if (!kv->value.arr.data) {
  5615. fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
  5616. fclose(file);
  5617. gguf_free(ctx);
  5618. return NULL;
  5619. }
  5620. ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
  5621. } break;
  5622. case GGUF_TYPE_STRING:
  5623. {
  5624. // prevent from integer overflow in the malloc below
  5625. if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
  5626. fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
  5627. fclose(file);
  5628. gguf_free(ctx);
  5629. return NULL;
  5630. }
  5631. kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
  5632. if (!kv->value.arr.data) {
  5633. fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
  5634. fclose(file);
  5635. gguf_free(ctx);
  5636. return NULL;
  5637. }
  5638. for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
  5639. ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
  5640. }
  5641. } break;
  5642. case GGUF_TYPE_ARRAY:
  5643. default:
  5644. {
  5645. fprintf(stderr, "%s: invalid array type %d\n", __func__, kv->value.arr.type);
  5646. ok = false;
  5647. } break;
  5648. }
  5649. } break;
  5650. default:
  5651. {
  5652. fprintf(stderr, "%s: invalid type %d\n", __func__, kv->type);
  5653. ok = false;
  5654. } break;
  5655. }
  5656. if (!ok) {
  5657. break;
  5658. }
  5659. }
  5660. if (!ok) {
  5661. fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
  5662. fclose(file);
  5663. gguf_free(ctx);
  5664. return NULL;
  5665. }
  5666. }
  5667. // read the tensor infos
  5668. if (ctx->header.n_tensors > 0) {
  5669. ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
  5670. if (!ctx->infos) {
  5671. fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
  5672. fclose(file);
  5673. gguf_free(ctx);
  5674. return NULL;
  5675. }
  5676. for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  5677. struct gguf_tensor_info * info = &ctx->infos[i];
  5678. for (int j = 0; j < GGML_MAX_DIMS; ++j) {
  5679. info->ne[j] = 1;
  5680. }
  5681. ok = ok && gguf_fread_str(file, &info->name, &offset);
  5682. ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
  5683. ok = ok && (info->n_dims <= GGML_MAX_DIMS);
  5684. for (uint32_t j = 0; j < info->n_dims; ++j) {
  5685. ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
  5686. }
  5687. ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
  5688. ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
  5689. ok = ok && gguf_tensor_info_sanitize(info);
  5690. // make sure there is no duplicated tensor names
  5691. for (uint64_t j = 0; j < i && ok; ++j) {
  5692. if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
  5693. fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
  5694. ok = false;
  5695. }
  5696. }
  5697. if (!ok) {
  5698. fprintf(stderr, "%s: failed to read tensor info\n", __func__);
  5699. fclose(file);
  5700. gguf_free(ctx);
  5701. return NULL;
  5702. }
  5703. }
  5704. }
  5705. ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
  5706. int alignment_idx = gguf_find_key(ctx, "general.alignment");
  5707. if (alignment_idx != -1) {
  5708. ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
  5709. }
  5710. // we require the data section to be aligned, so take into account any padding
  5711. {
  5712. const size_t offset_pad = offset % ctx->alignment;
  5713. if (offset_pad != 0) {
  5714. offset += ctx->alignment - offset_pad;
  5715. fseek(file, offset, SEEK_SET);
  5716. }
  5717. }
  5718. // store the current file offset - this is where the data section starts
  5719. ctx->offset = offset;
  5720. // compute the total size of the data section, taking into account the alignment
  5721. {
  5722. ctx->size = 0;
  5723. for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  5724. struct gguf_tensor_info * info = &ctx->infos[i];
  5725. const int64_t ne =
  5726. (int64_t) info->ne[0] *
  5727. (int64_t) info->ne[1] *
  5728. (int64_t) info->ne[2] *
  5729. (int64_t) info->ne[3];
  5730. if (ggml_blck_size(info->type) == 0 ) {
  5731. // this tensor type support have been removed:
  5732. fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
  5733. __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
  5734. fclose(file);
  5735. gguf_free(ctx);
  5736. return NULL;
  5737. }
  5738. if (ne % ggml_blck_size(info->type) != 0) {
  5739. fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
  5740. __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
  5741. fclose(file);
  5742. gguf_free(ctx);
  5743. return NULL;
  5744. }
  5745. const size_t size_cur = ggml_row_size(info->type, ne);
  5746. ctx->size += GGML_PAD(size_cur, ctx->alignment);
  5747. }
  5748. }
  5749. // load the tensor data only if requested
  5750. if (params.ctx != NULL) {
  5751. // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
  5752. // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
  5753. // the ggml_tensor structs to the appropriate locations in the binary blob
  5754. // compute the exact size needed for the new ggml_context
  5755. const size_t mem_size =
  5756. params.no_alloc ?
  5757. (ctx->header.n_tensors )*ggml_tensor_overhead() :
  5758. (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
  5759. struct ggml_init_params pdata = {
  5760. .mem_size = mem_size,
  5761. .mem_buffer = NULL,
  5762. .no_alloc = params.no_alloc,
  5763. };
  5764. *params.ctx = ggml_init(pdata);
  5765. if (*params.ctx == NULL) {
  5766. fprintf(stderr, "%s: failed to initialize context\n", __func__);
  5767. fclose(file);
  5768. gguf_free(ctx);
  5769. return NULL;
  5770. }
  5771. struct ggml_context * ctx_data = *params.ctx;
  5772. struct ggml_tensor * data = NULL;
  5773. if (!params.no_alloc) {
  5774. data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
  5775. ok = ok && data != NULL;
  5776. // read the binary blob with the tensor data
  5777. ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
  5778. if (!ok) {
  5779. fprintf(stderr, "%s: failed to read tensor data\n", __func__);
  5780. fclose(file);
  5781. ggml_free(ctx_data);
  5782. gguf_free(ctx);
  5783. return NULL;
  5784. }
  5785. ctx->data = data->data;
  5786. }
  5787. ggml_set_no_alloc(ctx_data, true);
  5788. // create the tensors
  5789. for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  5790. const int64_t ne[GGML_MAX_DIMS] = {
  5791. ctx->infos[i].ne[0],
  5792. ctx->infos[i].ne[1],
  5793. ctx->infos[i].ne[2],
  5794. ctx->infos[i].ne[3],
  5795. };
  5796. struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
  5797. ok = ok && cur != NULL;
  5798. if (!ok) {
  5799. break;
  5800. }
  5801. ggml_set_name(cur, ctx->infos[i].name.data);
  5802. // point the data member to the appropriate location in the binary blob using the tensor infos
  5803. if (!params.no_alloc) {
  5804. //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
  5805. cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
  5806. }
  5807. }
  5808. if (!ok) {
  5809. fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
  5810. fclose(file);
  5811. ggml_free(ctx_data);
  5812. gguf_free(ctx);
  5813. return NULL;
  5814. }
  5815. ggml_set_no_alloc(ctx_data, params.no_alloc);
  5816. }
  5817. fclose(file);
  5818. return ctx;
  5819. }
  5820. void gguf_free(struct gguf_context * ctx) {
  5821. if (ctx == NULL) {
  5822. return;
  5823. }
  5824. if (ctx->kv) {
  5825. // free string memory - not great..
  5826. for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
  5827. gguf_free_kv(&ctx->kv[i]);
  5828. }
  5829. GGML_FREE(ctx->kv);
  5830. }
  5831. if (ctx->infos) {
  5832. for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
  5833. struct gguf_tensor_info * info = &ctx->infos[i];
  5834. if (info->name.data) {
  5835. GGML_FREE(info->name.data);
  5836. }
  5837. }
  5838. GGML_FREE(ctx->infos);
  5839. }
  5840. GGML_FREE(ctx);
  5841. }
  5842. const char * gguf_type_name(enum gguf_type type) {
  5843. return GGUF_TYPE_NAME[type];
  5844. }
  5845. int gguf_get_version(const struct gguf_context * ctx) {
  5846. return ctx->header.version;
  5847. }
  5848. size_t gguf_get_alignment(const struct gguf_context * ctx) {
  5849. return ctx->alignment;
  5850. }
  5851. size_t gguf_get_data_offset(const struct gguf_context * ctx) {
  5852. return ctx->offset;
  5853. }
  5854. void * gguf_get_data(const struct gguf_context * ctx) {
  5855. return ctx->data;
  5856. }
  5857. int gguf_get_n_kv(const struct gguf_context * ctx) {
  5858. return ctx->header.n_kv;
  5859. }
  5860. int gguf_find_key(const struct gguf_context * ctx, const char * key) {
  5861. // return -1 if key not found
  5862. int keyfound = -1;
  5863. const int n_kv = gguf_get_n_kv(ctx);
  5864. for (int i = 0; i < n_kv; ++i) {
  5865. if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
  5866. keyfound = i;
  5867. break;
  5868. }
  5869. }
  5870. return keyfound;
  5871. }
  5872. const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
  5873. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5874. return ctx->kv[key_id].key.data;
  5875. }
  5876. enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
  5877. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5878. return ctx->kv[key_id].type;
  5879. }
  5880. enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
  5881. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5882. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  5883. return ctx->kv[key_id].value.arr.type;
  5884. }
  5885. const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
  5886. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5887. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  5888. return ctx->kv[key_id].value.arr.data;
  5889. }
  5890. const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
  5891. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5892. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  5893. struct gguf_kv * kv = &ctx->kv[key_id];
  5894. struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
  5895. return str->data;
  5896. }
  5897. int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
  5898. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5899. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
  5900. return ctx->kv[key_id].value.arr.n;
  5901. }
  5902. uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
  5903. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5904. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
  5905. return ctx->kv[key_id].value.uint8;
  5906. }
  5907. int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
  5908. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5909. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
  5910. return ctx->kv[key_id].value.int8;
  5911. }
  5912. uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
  5913. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5914. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
  5915. return ctx->kv[key_id].value.uint16;
  5916. }
  5917. int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
  5918. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5919. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
  5920. return ctx->kv[key_id].value.int16;
  5921. }
  5922. uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
  5923. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5924. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
  5925. return ctx->kv[key_id].value.uint32;
  5926. }
  5927. int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
  5928. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5929. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
  5930. return ctx->kv[key_id].value.int32;
  5931. }
  5932. float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
  5933. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5934. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
  5935. return ctx->kv[key_id].value.float32;
  5936. }
  5937. uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
  5938. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5939. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
  5940. return ctx->kv[key_id].value.uint64;
  5941. }
  5942. int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
  5943. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5944. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
  5945. return ctx->kv[key_id].value.int64;
  5946. }
  5947. double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
  5948. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5949. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
  5950. return ctx->kv[key_id].value.float64;
  5951. }
  5952. bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
  5953. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5954. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
  5955. return ctx->kv[key_id].value.bool_;
  5956. }
  5957. const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
  5958. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5959. GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
  5960. return ctx->kv[key_id].value.str.data;
  5961. }
  5962. const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
  5963. GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
  5964. GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
  5965. GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
  5966. return &ctx->kv[key_id].value;
  5967. }
  5968. int gguf_get_n_tensors(const struct gguf_context * ctx) {
  5969. return ctx->header.n_tensors;
  5970. }
  5971. int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
  5972. // return -1 if tensor not found
  5973. int tensorfound = -1;
  5974. const int n_tensors = gguf_get_n_tensors(ctx);
  5975. for (int i = 0; i < n_tensors; ++i) {
  5976. if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
  5977. tensorfound = i;
  5978. break;
  5979. }
  5980. }
  5981. return tensorfound;
  5982. }
  5983. size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
  5984. return ctx->infos[i].offset;
  5985. }
  5986. char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
  5987. return ctx->infos[i].name.data;
  5988. }
  5989. enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
  5990. return ctx->infos[i].type;
  5991. }
  5992. // returns the index
  5993. static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
  5994. const int idx = gguf_find_key(ctx, key);
  5995. if (idx >= 0) {
  5996. return idx;
  5997. }
  5998. const int n_kv = gguf_get_n_kv(ctx);
  5999. ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
  6000. ctx->kv[n_kv].key.n = strlen(key);
  6001. ctx->kv[n_kv].key.data = strdup(key);
  6002. ctx->header.n_kv++;
  6003. return n_kv;
  6004. }
  6005. void gguf_remove_key(struct gguf_context * ctx, const char * key) {
  6006. const int idx = gguf_find_key(ctx, key);
  6007. if (idx >= 0) {
  6008. const int n_kv = gguf_get_n_kv(ctx);
  6009. gguf_free_kv(&ctx->kv[idx]);
  6010. for (int i = idx; i < n_kv-1; ++i) {
  6011. ctx->kv[i] = ctx->kv[i+1];
  6012. }
  6013. ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
  6014. ctx->header.n_kv--;
  6015. }
  6016. }
  6017. void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
  6018. const int idx = gguf_get_or_add_key(ctx, key);
  6019. ctx->kv[idx].type = GGUF_TYPE_UINT8;
  6020. ctx->kv[idx].value.uint8 = val;
  6021. }
  6022. void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
  6023. const int idx = gguf_get_or_add_key(ctx, key);
  6024. ctx->kv[idx].type = GGUF_TYPE_INT8;
  6025. ctx->kv[idx].value.int8 = val;
  6026. }
  6027. void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
  6028. const int idx = gguf_get_or_add_key(ctx, key);
  6029. ctx->kv[idx].type = GGUF_TYPE_UINT16;
  6030. ctx->kv[idx].value.uint16 = val;
  6031. }
  6032. void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
  6033. const int idx = gguf_get_or_add_key(ctx, key);
  6034. ctx->kv[idx].type = GGUF_TYPE_INT16;
  6035. ctx->kv[idx].value.int16 = val;
  6036. }
  6037. void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
  6038. const int idx = gguf_get_or_add_key(ctx, key);
  6039. ctx->kv[idx].type = GGUF_TYPE_UINT32;
  6040. ctx->kv[idx].value.uint32 = val;
  6041. }
  6042. void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
  6043. const int idx = gguf_get_or_add_key(ctx, key);
  6044. ctx->kv[idx].type = GGUF_TYPE_INT32;
  6045. ctx->kv[idx].value.int32 = val;
  6046. }
  6047. void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
  6048. const int idx = gguf_get_or_add_key(ctx, key);
  6049. ctx->kv[idx].type = GGUF_TYPE_FLOAT32;
  6050. ctx->kv[idx].value.float32 = val;
  6051. }
  6052. void gguf_set_val_u64(struct gguf_context * ctx, const char * key, uint64_t val) {
  6053. const int idx = gguf_get_or_add_key(ctx, key);
  6054. ctx->kv[idx].type = GGUF_TYPE_UINT64;
  6055. ctx->kv[idx].value.uint64 = val;
  6056. }
  6057. void gguf_set_val_i64(struct gguf_context * ctx, const char * key, int64_t val) {
  6058. const int idx = gguf_get_or_add_key(ctx, key);
  6059. ctx->kv[idx].type = GGUF_TYPE_INT64;
  6060. ctx->kv[idx].value.int64 = val;
  6061. }
  6062. void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
  6063. const int idx = gguf_get_or_add_key(ctx, key);
  6064. ctx->kv[idx].type = GGUF_TYPE_FLOAT64;
  6065. ctx->kv[idx].value.float64 = val;
  6066. }
  6067. void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
  6068. const int idx = gguf_get_or_add_key(ctx, key);
  6069. ctx->kv[idx].type = GGUF_TYPE_BOOL;
  6070. ctx->kv[idx].value.bool_ = val;
  6071. }
  6072. void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
  6073. const int idx = gguf_get_or_add_key(ctx, key);
  6074. ctx->kv[idx].type = GGUF_TYPE_STRING;
  6075. ctx->kv[idx].value.str.n = strlen(val);
  6076. ctx->kv[idx].value.str.data = strdup(val);
  6077. }
  6078. void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
  6079. const int idx = gguf_get_or_add_key(ctx, key);
  6080. ctx->kv[idx].type = GGUF_TYPE_ARRAY;
  6081. ctx->kv[idx].value.arr.type = type;
  6082. ctx->kv[idx].value.arr.n = n;
  6083. ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
  6084. memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
  6085. }
  6086. void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
  6087. const int idx = gguf_get_or_add_key(ctx, key);
  6088. ctx->kv[idx].type = GGUF_TYPE_ARRAY;
  6089. ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
  6090. ctx->kv[idx].value.arr.n = n;
  6091. ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
  6092. for (int i = 0; i < n; i++) {
  6093. struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
  6094. str->n = strlen(data[i]);
  6095. str->data = strdup(data[i]);
  6096. }
  6097. }
  6098. // set or add KV pairs from another context
  6099. void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
  6100. for (uint32_t i = 0; i < src->header.n_kv; i++) {
  6101. switch (src->kv[i].type) {
  6102. case GGUF_TYPE_UINT8: gguf_set_val_u8 (ctx, src->kv[i].key.data, src->kv[i].value.uint8); break;
  6103. case GGUF_TYPE_INT8: gguf_set_val_i8 (ctx, src->kv[i].key.data, src->kv[i].value.int8); break;
  6104. case GGUF_TYPE_UINT16: gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16); break;
  6105. case GGUF_TYPE_INT16: gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16); break;
  6106. case GGUF_TYPE_UINT32: gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32); break;
  6107. case GGUF_TYPE_INT32: gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32); break;
  6108. case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32); break;
  6109. case GGUF_TYPE_UINT64: gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64); break;
  6110. case GGUF_TYPE_INT64: gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64); break;
  6111. case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
  6112. case GGUF_TYPE_BOOL: gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_); break;
  6113. case GGUF_TYPE_STRING: gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
  6114. case GGUF_TYPE_ARRAY:
  6115. {
  6116. if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
  6117. const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
  6118. for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
  6119. data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
  6120. }
  6121. gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
  6122. GGML_FREE((void *)data);
  6123. } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
  6124. GGML_ABORT("nested arrays not supported");
  6125. } else {
  6126. gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
  6127. }
  6128. } break;
  6129. default: GGML_ABORT("invalid type");
  6130. }
  6131. }
  6132. }
  6133. void gguf_add_tensor(
  6134. struct gguf_context * ctx,
  6135. const struct ggml_tensor * tensor) {
  6136. GGML_ASSERT(tensor);
  6137. if (gguf_find_tensor(ctx, tensor->name) != -1) {
  6138. GGML_ABORT("duplicated tensor name");
  6139. }
  6140. const int idx = ctx->header.n_tensors;
  6141. ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
  6142. ctx->infos[idx].name.n = strlen(tensor->name);
  6143. ctx->infos[idx].name.data = strdup(tensor->name);
  6144. for (int i = 0; i < GGML_MAX_DIMS; ++i) {
  6145. ctx->infos[idx].ne[i] = 1;
  6146. }
  6147. ctx->infos[idx].n_dims = ggml_n_dims(tensor);
  6148. for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
  6149. ctx->infos[idx].ne[i] = tensor->ne[i];
  6150. }
  6151. ctx->infos[idx].type = tensor->type;
  6152. ctx->infos[idx].offset = 0;
  6153. ctx->infos[idx].data = tensor->data;
  6154. ctx->infos[idx].size = ggml_nbytes(tensor);
  6155. if (ctx->header.n_tensors > 0) {
  6156. ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
  6157. }
  6158. ctx->header.n_tensors++;
  6159. }
  6160. void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
  6161. const int idx = gguf_find_tensor(ctx, name);
  6162. if (idx < 0) {
  6163. GGML_ABORT("tensor not found");
  6164. }
  6165. ctx->infos[idx].type = type;
  6166. }
  6167. void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
  6168. const int idx = gguf_find_tensor(ctx, name);
  6169. if (idx < 0) {
  6170. GGML_ABORT("tensor not found");
  6171. }
  6172. ctx->infos[idx].data = data;
  6173. ctx->infos[idx].size = size;
  6174. // update offsets
  6175. for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
  6176. ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
  6177. }
  6178. }
  6179. //static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
  6180. // fwrite(&val->n, sizeof(val->n), 1, file);
  6181. // fwrite(val->data, sizeof(char), val->n, file);
  6182. //}
  6183. //
  6184. //static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
  6185. // fwrite(val, sizeof(char), size, file);
  6186. //}
  6187. struct gguf_buf {
  6188. void * data;
  6189. size_t size;
  6190. size_t offset;
  6191. };
  6192. static struct gguf_buf gguf_buf_init(size_t size) {
  6193. struct gguf_buf buf = {
  6194. /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
  6195. /*buf.size =*/ size,
  6196. /*buf.offset =*/ 0,
  6197. };
  6198. return buf;
  6199. }
  6200. static void gguf_buf_free(struct gguf_buf buf) {
  6201. if (buf.data) {
  6202. GGML_FREE(buf.data);
  6203. }
  6204. }
  6205. static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
  6206. if (buf->offset + size > buf->size) {
  6207. buf->size = 1.5*(buf->offset + size);
  6208. if (buf->data) {
  6209. buf->data = realloc(buf->data, buf->size);
  6210. }
  6211. }
  6212. }
  6213. static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
  6214. gguf_buf_grow(buf, sizeof(val->n) + val->n);
  6215. if (buf->data) {
  6216. memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
  6217. }
  6218. buf->offset += sizeof(val->n);
  6219. if (buf->data) {
  6220. memcpy((char *) buf->data + buf->offset, val->data, val->n);
  6221. }
  6222. buf->offset += val->n;
  6223. }
  6224. static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
  6225. gguf_buf_grow(buf, el_size);
  6226. if (buf->data) {
  6227. memcpy((char *) buf->data + buf->offset, val, el_size);
  6228. }
  6229. buf->offset += el_size;
  6230. }
  6231. static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
  6232. // write header
  6233. gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic));
  6234. gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version));
  6235. gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
  6236. gguf_bwrite_el(buf, &ctx->header.n_kv, sizeof(ctx->header.n_kv));
  6237. // write key-value pairs
  6238. for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
  6239. struct gguf_kv * kv = &ctx->kv[i];
  6240. gguf_bwrite_str(buf, &kv->key);
  6241. gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
  6242. switch (kv->type) {
  6243. case GGUF_TYPE_UINT8: gguf_bwrite_el( buf, &kv->value.uint8, sizeof(kv->value.uint8) ); break;
  6244. case GGUF_TYPE_INT8: gguf_bwrite_el (buf, &kv->value.int8, sizeof(kv->value.int8) ); break;
  6245. case GGUF_TYPE_UINT16: gguf_bwrite_el (buf, &kv->value.uint16, sizeof(kv->value.uint16) ); break;
  6246. case GGUF_TYPE_INT16: gguf_bwrite_el (buf, &kv->value.int16, sizeof(kv->value.int16) ); break;
  6247. case GGUF_TYPE_UINT32: gguf_bwrite_el (buf, &kv->value.uint32, sizeof(kv->value.uint32) ); break;
  6248. case GGUF_TYPE_INT32: gguf_bwrite_el (buf, &kv->value.int32, sizeof(kv->value.int32) ); break;
  6249. case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
  6250. case GGUF_TYPE_UINT64: gguf_bwrite_el (buf, &kv->value.uint64, sizeof(kv->value.uint64) ); break;
  6251. case GGUF_TYPE_INT64: gguf_bwrite_el (buf, &kv->value.int64, sizeof(kv->value.int64) ); break;
  6252. case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
  6253. case GGUF_TYPE_BOOL: gguf_bwrite_el (buf, &kv->value.bool_, sizeof(kv->value.bool_) ); break;
  6254. case GGUF_TYPE_STRING: gguf_bwrite_str(buf, &kv->value.str ); break;
  6255. case GGUF_TYPE_ARRAY:
  6256. {
  6257. gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
  6258. gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n) );
  6259. switch (kv->value.arr.type) {
  6260. case GGUF_TYPE_UINT8:
  6261. case GGUF_TYPE_INT8:
  6262. case GGUF_TYPE_UINT16:
  6263. case GGUF_TYPE_INT16:
  6264. case GGUF_TYPE_UINT32:
  6265. case GGUF_TYPE_INT32:
  6266. case GGUF_TYPE_FLOAT32:
  6267. case GGUF_TYPE_UINT64:
  6268. case GGUF_TYPE_INT64:
  6269. case GGUF_TYPE_FLOAT64:
  6270. case GGUF_TYPE_BOOL:
  6271. {
  6272. gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type));
  6273. } break;
  6274. case GGUF_TYPE_STRING:
  6275. {
  6276. for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
  6277. gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
  6278. }
  6279. } break;
  6280. case GGUF_TYPE_ARRAY:
  6281. default: GGML_ABORT("invalid type");
  6282. }
  6283. } break;
  6284. default: GGML_ABORT("invalid type");
  6285. }
  6286. }
  6287. // write tensor infos
  6288. for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
  6289. struct gguf_tensor_info * info = &ctx->infos[i];
  6290. gguf_bwrite_str(buf, &info->name);
  6291. gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
  6292. for (uint32_t j = 0; j < info->n_dims; ++j) {
  6293. gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
  6294. }
  6295. gguf_bwrite_el(buf, &info->type, sizeof(info->type));
  6296. gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
  6297. }
  6298. // we require the data section to be aligned, so take into account any padding
  6299. {
  6300. const size_t offset = buf->offset;
  6301. const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
  6302. if (offset_pad != offset) {
  6303. uint8_t pad = 0;
  6304. for (size_t i = 0; i < offset_pad - offset; ++i) {
  6305. gguf_bwrite_el(buf, &pad, sizeof(pad));
  6306. }
  6307. }
  6308. }
  6309. if (only_meta) {
  6310. return;
  6311. }
  6312. size_t offset = 0;
  6313. // write tensor data
  6314. for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
  6315. struct gguf_tensor_info * info = &ctx->infos[i];
  6316. const size_t size = info->size;
  6317. const size_t size_pad = GGML_PAD(size, ctx->alignment);
  6318. gguf_bwrite_el(buf, info->data, size);
  6319. if (size_pad != size) {
  6320. uint8_t pad = 0;
  6321. for (size_t j = 0; j < size_pad - size; ++j) {
  6322. gguf_bwrite_el(buf, &pad, sizeof(pad));
  6323. }
  6324. }
  6325. GGML_ASSERT(offset == info->offset);
  6326. offset += size_pad;
  6327. }
  6328. }
  6329. void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) {
  6330. FILE * file = ggml_fopen(fname, "wb");
  6331. if (!file) {
  6332. GGML_ABORT("failed to open file for writing");
  6333. }
  6334. struct gguf_buf buf = gguf_buf_init(16*1024);
  6335. gguf_write_to_buf(ctx, &buf, only_meta);
  6336. fwrite(buf.data, 1, buf.offset, file);
  6337. gguf_buf_free(buf);
  6338. fclose(file);
  6339. }
  6340. size_t gguf_get_meta_size(const struct gguf_context * ctx) {
  6341. // no allocs - only compute size
  6342. struct gguf_buf buf = gguf_buf_init(0);
  6343. gguf_write_to_buf(ctx, &buf, true);
  6344. return buf.offset;
  6345. }
  6346. void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
  6347. struct gguf_buf buf = gguf_buf_init(16*1024);
  6348. gguf_write_to_buf(ctx, &buf, true);
  6349. memcpy(data, buf.data, buf.offset);
  6350. gguf_buf_free(buf);
  6351. }
  6352. void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
  6353. g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
  6354. g_logger_state.log_callback_user_data = user_data;
  6355. }
  6356. void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) {
  6357. p->n_threads = n_threads;
  6358. p->prio = 0; // default priority (usually means normal or inherited)
  6359. p->poll = 50; // hybrid-polling enabled
  6360. p->strict_cpu = false; // no strict placement (all threads share same cpumask)
  6361. p->paused = false; // threads are ready to go
  6362. memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
  6363. }
  6364. struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
  6365. struct ggml_threadpool_params p;
  6366. ggml_threadpool_params_init(&p, n_threads);
  6367. return p;
  6368. }
  6369. bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
  6370. if (p0->n_threads != p1->n_threads ) return false;
  6371. if (p0->prio != p1->prio ) return false;
  6372. if (p0->poll != p1->poll ) return false;
  6373. if (p0->strict_cpu != p1->strict_cpu ) return false;
  6374. return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
  6375. }