// llama-context.cpp
  1. #include "llama-context.h"
  2. #include "llama-impl.h"
  3. #include "llama-mmap.h"
  4. #include <cassert>
  5. #include <cmath>
  6. #include <cstring>
  7. #include <stdexcept>
  8. void llama_set_k_shift(struct llama_context & lctx) {
  9. const int64_t kv_size = lctx.kv_self.size;
  10. assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
  11. int32_t * data = (int32_t *) lctx.inp_K_shift->data;
  12. for (int i = 0; i < kv_size; ++i) {
  13. data[i] = lctx.kv_self.cells[i].delta;
  14. }
  15. }
  16. void llama_set_s_copy(struct llama_context & lctx) {
  17. const int64_t kv_size = lctx.kv_self.size;
  18. assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
  19. int32_t * data = (int32_t *) lctx.inp_s_copy->data;
  20. for (int i = 0; i < kv_size; ++i) {
  21. data[i] = lctx.kv_self.cells[i].src;
  22. }
  23. }
  24. // llama input
  25. static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
  26. // TODO move to hparams if a T5 variant appears that uses a different value
  27. const int64_t max_distance = 128;
  28. if (bidirectional) {
  29. n_buckets >>= 1;
  30. }
  31. const int64_t max_exact = n_buckets >> 1;
  32. int32_t relative_position = x - y;
  33. int32_t relative_bucket = 0;
  34. if (bidirectional) {
  35. relative_bucket += (relative_position > 0) * n_buckets;
  36. relative_position = abs(relative_position);
  37. } else {
  38. relative_position = -std::min<int32_t>(relative_position, 0);
  39. }
  40. int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
  41. relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
  42. relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
  43. return relative_bucket;
  44. }
// Upload all graph-input tensors for the current micro-batch: token ids or
// embeddings, positions, output-row selection, attention masks (causal,
// sliding-window, cross), pooling helpers (mean/cls/last) and the
// recurrent-state bookkeeping tensors.
// Host-resident tensors are written directly; device tensors go through
// ggml_backend_tensor_set.
void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
    //
    // set input data
    //

    const auto & hparams = lctx.model.hparams;
    const auto & cparams = lctx.cparams;
    const auto & kv_self = lctx.kv_self;

    if (ubatch.token) {
        const int64_t n_tokens = ubatch.n_tokens;

        ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
    }

    if (ubatch.embd) {
        if (lctx.inp_cross_attn_state && lctx.inp_cross_attn_state->buffer) {
            // cross-attention path: the batch embeddings feed the cross-attn state tensor
            ggml_backend_tensor_set(lctx.inp_cross_attn_state, ubatch.embd, 0, ggml_nbytes(lctx.inp_cross_attn_state));
            // zero out inp_embd since it's not used
            float * inp_embd_data = (float *)lctx.inp_embd->data;
            for (int i = 0; i < ggml_nelements(lctx.inp_embd); ++i) {
                inp_embd_data[i] = 0.0f;
            }
        } else {
            const int64_t n_embd   = hparams.n_embd;
            const int64_t n_tokens = ubatch.n_tokens;

            ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
        }
    }

    if (ubatch.pos && lctx.inp_pos) {
        const int64_t n_tokens = ubatch.n_tokens;
        // some models use more than one position value per token (e.g. M-RoPE)
        auto n_pos = lctx.n_pos_per_token;
        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
    }

    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
        //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");

        if (!lctx.inp_out_ids) {
            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
        } else {
            const int64_t n_tokens = ubatch.n_tokens;

            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
            int32_t * data = (int32_t *) lctx.inp_out_ids->data;

            if (lctx.n_outputs == n_tokens) {
                // every token produces an output: identity mapping
                for (int i = 0; i < n_tokens; ++i) {
                    data[i] = i;
                }
            } else if (ubatch.output) {
                // keep only the rows flagged in the batch
                int32_t n_outputs = 0;
                for (int i = 0; i < n_tokens; ++i) {
                    if (ubatch.output[i]) {
                        data[n_outputs++] = i;
                    }
                }
                // the graph needs to have been passed the correct number of outputs
                GGML_ASSERT(lctx.n_outputs == n_outputs);
            } else if (lctx.n_outputs == 1) {
                // only keep last output
                data[0] = n_tokens - 1;
            } else {
                GGML_ASSERT(lctx.n_outputs == 0);
            }
        }
    }

    GGML_ASSERT(
        // (!a || b) is a logical implication (a -> b)
        // !hparams.causal_attn -> !cparams.causal_attn
        (hparams.causal_attn || !cparams.causal_attn) &&
        "causal attention is not supported by this model"
    );

    if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
        // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
        if (cparams.causal_attn && !lctx.is_encoding) {
            const int64_t n_kv         = kv_self.n;
            const int64_t n_tokens     = ubatch.n_tokens;
            const int64_t n_seq_tokens = ubatch.n_seq_tokens;
            const int64_t n_seqs       = ubatch.n_seqs;

            float * data     = nullptr;
            float * data_swa = nullptr;

            if (lctx.inp_KQ_mask) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
                data = (float *) lctx.inp_KQ_mask->data;
            }

            if (lctx.inp_KQ_mask_swa) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
                data_swa = (float *) lctx.inp_KQ_mask_swa->data;
            }

            // For causal attention, use only the previous KV cells
            // of the correct sequence for each token of the ubatch.
            // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
            for (int h = 0; h < 1; ++h) {
                for (int s = 0; s < n_seqs; ++s) {
                    const llama_seq_id seq_id = ubatch.seq_id[s][0];

                    for (int j = 0; j < n_seq_tokens; ++j) {
                        const llama_pos pos = ubatch.pos[s*n_seq_tokens + j];

                        for (int i = 0; i < n_kv; ++i) {
                            float f;
                            // mask cells of other sequences and future positions
                            if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
                                f = -INFINITY;
                            } else {
                                if (hparams.use_alibi) {
                                    // ALiBi: bias by negative distance instead of 0/-inf only
                                    f = -std::abs(kv_self.cells[i].pos - pos);
                                } else {
                                    f = 0.0f;
                                }
                            }

                            if (data) {
                                data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
                            }

                            // may need to cut off old tokens for sliding window
                            if (data_swa) {
                                if (pos - kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
                                    f = -INFINITY;
                                }
                                data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
                            }
                        }
                    }
                }

                // mask the padding rows (n_tokens is padded to GGML_KQ_MASK_PAD)
                if (data) {
                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
                        for (int j = 0; j < n_kv; ++j) {
                            data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
                        }
                    }
                }

                if (data_swa) {
                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
                        for (int j = 0; j < n_kv; ++j) {
                            data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
                        }
                    }
                }
            }
        } else {
            // non-causal case (encoders, embeddings): tokens attend within their sequence
            const int64_t n_tokens     = ubatch.n_tokens;
            const int64_t n_seq_tokens = ubatch.n_seq_tokens;
            const int64_t n_seqs       = ubatch.n_seqs;
            // when using kv cache, the mask needs to match the kv cache size
            const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens;

            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));

            float * data = (float *) lctx.inp_KQ_mask->data;

            for (int h = 0; h < 1; ++h) {
                for (int s1 = 0; s1 < n_seqs; ++s1) {
                    const llama_seq_id seq_id = ubatch.seq_id[s1][0];

                    for (int j = 0; j < n_seq_tokens; ++j) {
                        const int32_t tj = s1*n_seq_tokens + j;

                        for (int s0 = 0; s0 < n_seqs; ++s0) {
                            for (int i = 0; i < n_seq_tokens; ++i) {
                                const int32_t ti = s0*n_seq_tokens + i;
                                float f = -INFINITY;

                                // allow attention only between tokens that share seq_id
                                for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) {
                                    if (ubatch.seq_id[s0][s] == seq_id) {
                                        if (hparams.use_alibi) {
                                            f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]);
                                        } else {
                                            f = 0.0f;
                                        }
                                        break;
                                    }
                                }

                                data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f;
                            }
                        }

                        // mask the remainder of the stride past the batch tokens
                        for (int i = n_tokens; i < n_stride; ++i) {
                            data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY;
                        }
                    }
                }
            }
        }
    }

    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
        const int64_t n_tokens     = ubatch.n_tokens;
        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
        const int64_t n_seqs       = ubatch.n_seqs;

        GGML_ASSERT(lctx.inp_mean);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));

        float * data = (float *) lctx.inp_mean->data;
        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));

        // total token count per sequence
        std::vector<uint64_t> sum(n_tokens, 0);
        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch.seq_id[s][0];

            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");

            sum[seq_id] += ubatch.n_seq_tokens;
        }

        // precompute 1/count for each sequence (0 for unused rows)
        std::vector<float> div(n_tokens, 0.0f);
        for (int i = 0; i < n_tokens; ++i) {
            const uint64_t s = sum[i];
            if (s > 0) {
                div[i] = 1.0f/float(s);
            }
        }

        // every token of a sequence receives the same mean-pooling weight
        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch.seq_id[s][0];

            for (int i = 0; i < n_seq_tokens; ++i) {
                data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
            }
        }
    }

    if (cparams.embeddings && (
                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
        const int64_t n_tokens     = ubatch.n_tokens;
        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
        const int64_t n_seqs       = ubatch.n_seqs;

        GGML_ASSERT(lctx.inp_cls);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

        // record, per sequence, the batch row of the token at position 0
        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch.seq_id[s][0];

            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");

            for (int i = 0; i < n_seq_tokens; ++i) {
                const llama_pos pos = ubatch.pos[s*n_seq_tokens + i];

                if (pos == 0) {
                    data[seq_id] = s*n_seq_tokens + i;
                }
            }
        }
    }

    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
        const int64_t n_tokens     = ubatch.n_tokens;
        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
        const int64_t n_seqs       = ubatch.n_seqs;

        GGML_ASSERT(lctx.inp_cls);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

        // track, per sequence, the row holding the highest position seen so far
        std::vector<int> last_pos(n_tokens, -1);
        std::vector<int> last_row(n_tokens, -1);

        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch.seq_id[s][0];

            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");

            for (int i = 0; i < n_seq_tokens; ++i) {
                const llama_pos pos = ubatch.pos[s*n_seq_tokens + i];

                if (pos >= last_pos[seq_id]) {
                    last_pos[seq_id] = pos;
                    last_row[seq_id] = s*n_seq_tokens + i;
                }
            }
        }

        for (int i = 0; i < n_tokens; ++i) {
            if (last_row[i] >= 0) {
                data[i] = last_row[i];
            }
        }
    }

    if (kv_self.recurrent) {
        const int64_t n_kv = kv_self.n;

        if (lctx.inp_s_mask) {
            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer));
            float * data = (float *) lctx.inp_s_mask->data;

            // clear unused states
            for (int i = 0; i < n_kv; ++i) {
                const uint32_t  cell_id = i + kv_self.head;
                llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];

                data[i] = (float) (kv_cell.src >= 0);

                // only clear once
                if (kv_cell.src < 0) {
                    kv_cell.src = cell_id;
                }
            }
        }

        if (lctx.inp_s_copy) {
            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
            int32_t * data = (int32_t *) lctx.inp_s_copy->data;

            // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
            for (uint32_t i = 0; i < n_kv; ++i) {
                const uint32_t  cell_id = i + kv_self.head;
                llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];

                // prevent out-of-bound sources
                if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) {
                    kv_cell.src = cell_id;
                }

                data[i] = kv_cell.src;

                // ensure copy only happens once
                if (kv_cell.src != (int32_t) cell_id) {
                    kv_cell.src = cell_id;
                }
            }
        }
    }

    if (lctx.inp_pos_bucket) {
        const int64_t n_tokens = ubatch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer));
        GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing

        int32_t * data = (int32_t *) lctx.inp_pos_bucket->data;

        if (!lctx.is_encoding) {
            // decoder: relative buckets between KV cells and batch tokens
            const int64_t n_kv = kv_self.n;
            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    for (int i = 0; i < n_kv; ++i) {
                        data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
                    }
                }
            }
        } else {
            // encoder: relative buckets between batch tokens
            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    for (int i = 0; i < n_tokens; ++i) {
                        data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
                    }
                }
            }
        }
    }

    if (!lctx.is_encoding && lctx.inp_embd_enc) {
        // feed the saved encoder output to the decoder's cross-attention input
        assert(lctx.inp_embd_enc->type == GGML_TYPE_F32);
        assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size());

        ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc));
    }

    if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) {
        const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd;
        const int64_t n_tokens = ubatch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer));
        GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing

        float * data = (float *) lctx.inp_KQ_mask_cross->data;

        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                for (int i = 0; i < n_output_enc; ++i) {
                    float f = -INFINITY;
                    // decoder token j may attend to encoder output i only when they share a sequence
                    for (int s = 0; s < ubatch.n_seq_id[j]; ++s) {
                        const llama_seq_id seq_id = ubatch.seq_id[j][s];
                        if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) {
                            f = 0.0f;
                        }
                    }
                    data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f;
                }
            }

            // mask the padding rows
            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
                for (int j = 0; j < n_output_enc; ++j) {
                    data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY;
                }
            }
        }
    }
}
  382. // llama output
// Ensure the output buffers (logits and/or embeddings) can hold at least
// n_outputs rows (and at least one row per sequence). Reallocates only when
// growing. Returns the reserved row capacity, or 0 on allocation failure.
// Side effects: invalidates all output ids and zeroes the output buffer.
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
    const auto & cparams = lctx.cparams;
    const auto & hparams = lctx.model.hparams;

    // reserve at least one output slot per sequence
    const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);

    const auto n_batch = cparams.n_batch;
    const auto n_vocab = hparams.n_vocab;
    const auto n_embd  = hparams.n_embd;

    // TODO: use a per-batch flag for logits presence instead
    const bool has_logits = cparams.causal_attn;
    const bool has_embd   = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

    const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
    const size_t embd_size   = has_embd   ? n_embd*n_outputs_max  : 0;

    if (lctx.output_ids.empty()) {
        // init, never resized afterwards
        lctx.output_ids.resize(n_batch);
    }

    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
    const size_t new_size  = (logits_size + embd_size) * sizeof(float);

    // alloc only when more than the current capacity is required
    // TODO: also consider shrinking the buffer
    if (!lctx.buf_output || prev_size < new_size) {
        if (lctx.buf_output) {
#ifndef NDEBUG
            // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
            LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
            // drop the old buffer before allocating the new one
            lctx.buf_output = nullptr;
            lctx.logits = nullptr;
            lctx.embd = nullptr;
        }

        auto * buft = ggml_backend_cpu_buffer_type();
        // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
        auto * output_dev = lctx.model.dev_output();
        auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
        if (output_dev_host_buft) {
            buft = output_dev_host_buft;
        }
        lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));
        if (lctx.buf_output == nullptr) {
            LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
            return 0;
        }
    }

    // carve the single buffer into the logits region followed by the embeddings region
    float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get());

    lctx.logits = has_logits ? output_base               : nullptr;
    lctx.embd   = has_embd   ? output_base + logits_size : nullptr;

    lctx.output_size = n_outputs_max;
    lctx.logits_size = logits_size;
    lctx.embd_size   = embd_size;

    // set all ids as invalid (negative)
    std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);

    ggml_backend_buffer_clear(lctx.buf_output.get(), 0);

    lctx.n_outputs = 0;

    return n_outputs_max;
}
// Reorder the logits/embeddings rows from micro-batch order back into the
// original batch order recorded in sbatch.out_ids, keeping output_ids in
// sync. No-op when out_ids is empty (already in order or nothing to do).
void llama_output_reorder(struct llama_context & ctx) {
    std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
    if (!out_ids.empty()) {
        const uint32_t n_vocab = ctx.model.hparams.n_vocab;
        const uint32_t n_embd  = ctx.model.hparams.n_embd;

        const int32_t n_outputs = ctx.n_outputs;

        GGML_ASSERT((size_t) n_outputs == out_ids.size());

        // TODO: is there something more efficient which also minimizes swaps?
        // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
        for (int32_t i = 0; i < n_outputs - 1; ++i) {
            int32_t j_min = i;
            for (int32_t j = i + 1; j < n_outputs; ++j) {
                if (out_ids[j] < out_ids[j_min]) {
                    j_min = j;
                }
            }
            if (j_min == i) { continue; }
            // swap the id together with the matching logits and embeddings rows
            std::swap(out_ids[i], out_ids[j_min]);
            if (ctx.logits_size > 0) {
                for (uint32_t k = 0; k < n_vocab; k++) {
                    std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]);
                }
            }
            if (ctx.embd_size > 0) {
                for (uint32_t k = 0; k < n_embd; k++) {
                    std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]);
                }
            }
        }
        // rebuild batch-index -> output-row mapping
        std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1);
        for (int32_t i = 0; i < n_outputs; ++i) {
            ctx.output_ids[out_ids[i]] = i;
        }
        // mark as reordered so subsequent calls are no-ops
        out_ids.clear();
    }
}
  474. //
  475. // interface implementation
  476. //
// Destroy a context previously created alongside the model; safe on nullptr.
void llama_free(struct llama_context * ctx) {
    delete ctx;
}
// Return the context size (in tokens) this context was created with.
uint32_t llama_n_ctx(const struct llama_context * ctx) {
    return ctx->cparams.n_ctx;
}
// Return the maximum logical batch size (n_batch).
uint32_t llama_n_batch(const struct llama_context * ctx) {
    return ctx->cparams.n_batch;
}
// Return the maximum physical micro-batch size (n_ubatch).
uint32_t llama_n_ubatch(const struct llama_context * ctx) {
    return ctx->cparams.n_ubatch;
}
// Return the maximum number of sequences; here this equals the KV cache size.
uint32_t llama_n_seq_max(const struct llama_context * ctx) {
    return ctx->kv_self.size;
}
// Return the model this context was created from (non-owning pointer).
const struct llama_model * llama_get_model(const struct llama_context * ctx) {
    return &ctx->model;
}
// Return the pooling type configured for this context.
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
    return ctx->cparams.pooling_type;
}
  498. void llama_attach_threadpool(
  499. struct llama_context * ctx,
  500. ggml_threadpool_t threadpool,
  501. ggml_threadpool_t threadpool_batch) {
  502. ctx->threadpool = threadpool;
  503. ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
  504. }
// Detach any previously attached threadpools (reverts to internal defaults).
void llama_detach_threadpool(struct llama_context * ctx) {
    ctx->threadpool       = nullptr;
    ctx->threadpool_batch = nullptr;
}
// Set the thread counts used for single-token generation and for batch processing.
void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
    ctx->cparams.n_threads       = n_threads;
    ctx->cparams.n_threads_batch = n_threads_batch;
}
// Return the thread count used for generation.
int32_t llama_n_threads(struct llama_context * ctx) {
    return ctx->cparams.n_threads;
}
// Return the thread count used for batch processing.
int32_t llama_n_threads_batch(struct llama_context * ctx) {
    return ctx->cparams.n_threads_batch;
}
  519. void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
  520. ctx->abort_callback = abort_callback;
  521. ctx->abort_callback_data = abort_callback_data;
  522. for (auto & backend : ctx->backends) {
  523. auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
  524. auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
  525. if (set_abort_callback_fn) {
  526. set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
  527. }
  528. }
  529. }
// Enable or disable embeddings output for subsequent decodes.
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
    ctx->cparams.embeddings = embeddings;
}
// Enable or disable causal attention for subsequent decodes.
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
    ctx->cparams.causal_attn = causal_attn;
}
// Enable or disable cross-attention for subsequent decodes.
void llama_set_cross_attention(struct llama_context * ctx, bool cross_attention) {
    ctx->cparams.cross_attn = cross_attention;
}
// Block until all scheduled backend computation has finished, then fold the
// queued token count into the eval/prompt-eval timing statistics and reset
// the per-compute counters.
void llama_synchronize(struct llama_context * ctx) {
    ggml_backend_sched_synchronize(ctx->sched.get());

    // FIXME: if multiple single tokens are evaluated without a synchronization,
    // the stats will be added to the prompt evaluation stats
    // this should only happen when using batch size 1 to evaluate a batch

    // add the evaluation to the stats
    if (ctx->n_queued_tokens == 1) {
        // single token -> generation (eval) stats
        if (!ctx->cparams.no_perf) {
            ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
        }
        ctx->n_eval++;
    } else if (ctx->n_queued_tokens > 1) {
        // multiple tokens -> prompt processing (p_eval) stats
        if (!ctx->cparams.no_perf) {
            ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
        }
        ctx->n_p_eval += ctx->n_queued_tokens;
    }

    // get a more accurate load time, upon first eval
    if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) {
        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
        ctx->has_evaluated_once = true;
    }

    ctx->n_queued_tokens = 0;
    ctx->t_compute_start_us = 0;
}
// Return the full logits buffer (rows in batch order). Synchronizes first
// and reorders rows if a previous decode shuffled them.
float * llama_get_logits(struct llama_context * ctx) {
    llama_synchronize(ctx);

    // reorder logits for backward compatibility
    // TODO: maybe deprecate this
    llama_output_reorder(*ctx);

    return ctx->logits;
}
  571. float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
  572. int32_t j = -1;
  573. llama_synchronize(ctx);
  574. try {
  575. if (ctx->logits == nullptr) {
  576. throw std::runtime_error("no logits");
  577. }
  578. if (i < 0) {
  579. j = ctx->n_outputs + i;
  580. if (j < 0) {
  581. throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
  582. }
  583. } else if ((size_t) i >= ctx->output_ids.size()) {
  584. throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
  585. } else {
  586. j = ctx->output_ids[i];
  587. }
  588. if (j < 0) {
  589. throw std::runtime_error(format("batch.logits[%d] != true", i));
  590. }
  591. if (j >= ctx->n_outputs) {
  592. // This should not happen
  593. throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
  594. }
  595. return ctx->logits + j*ctx->model.hparams.n_vocab;
  596. } catch (const std::exception & err) {
  597. LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
  598. #ifndef NDEBUG
  599. GGML_ABORT("fatal error");
  600. #else
  601. return nullptr;
  602. #endif
  603. }
  604. }
// Return the full embeddings buffer (rows in batch order). Synchronizes
// first and reorders rows if a previous decode shuffled them.
float * llama_get_embeddings(struct llama_context * ctx) {
    llama_synchronize(ctx);

    // reorder embeddings for backward compatibility
    // TODO: maybe deprecate this
    llama_output_reorder(*ctx);

    return ctx->embd;
}
  612. float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
  613. int32_t j = -1;
  614. llama_synchronize(ctx);
  615. try {
  616. if (ctx->embd == nullptr) {
  617. throw std::runtime_error("no embeddings");
  618. }
  619. if (i < 0) {
  620. j = ctx->n_outputs + i;
  621. if (j < 0) {
  622. throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
  623. }
  624. } else if ((size_t) i >= ctx->output_ids.size()) {
  625. throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
  626. } else {
  627. j = ctx->output_ids[i];
  628. }
  629. if (j < 0) {
  630. throw std::runtime_error(format("batch.logits[%d] != true", i));
  631. }
  632. if (j >= ctx->n_outputs) {
  633. // This should not happen
  634. throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
  635. }
  636. return ctx->embd + j*ctx->model.hparams.n_embd;
  637. } catch (const std::exception & err) {
  638. LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
  639. #ifndef NDEBUG
  640. GGML_ABORT("fatal error");
  641. #else
  642. return nullptr;
  643. #endif
  644. }
  645. }
  646. float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
  647. llama_synchronize(ctx);
  648. auto it = ctx->embd_seq.find(seq_id);
  649. if (it == ctx->embd_seq.end()) {
  650. return nullptr;
  651. }
  652. return it->second.data();
  653. }
  654. // llama state API
  655. // deprecated
  656. size_t llama_get_state_size(struct llama_context * ctx) {
  657. return llama_state_get_size(ctx);
  658. }
  659. // deprecated
  660. size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  661. return llama_state_get_data(ctx, dst, -1);
  662. }
  663. // deprecated
  664. size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
  665. return llama_state_set_data(ctx, src, -1);
  666. }
  667. // deprecated
  668. bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  669. return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
  670. }
  671. // deprecated
  672. bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  673. return llama_state_save_file(ctx, path_session, tokens, n_token_count);
  674. }
  675. // TODO: replace all non-fatal assertions with returned errors or exceptions
  676. struct llama_data_write {
  677. virtual void write(const void * src, size_t size) = 0;
  678. virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
  679. virtual size_t get_size_written() = 0;
  680. virtual ~llama_data_write() = default;
  681. void write_string(const std::string & str) {
  682. uint32_t str_size = str.size();
  683. write(&str_size, sizeof(str_size));
  684. write(str.data(), str_size);
  685. }
  686. void write_model_info(const struct llama_context * ctx) {
  687. const std::string arch_str = llm_arch_name(ctx->model.arch);
  688. write_string(arch_str);
  689. // TODO: add more model-specific info which should prevent loading the session file if not identical
  690. }
  691. //void write_rng(const std::mt19937 & rng) {
  692. // std::ostringstream rng_ss;
  693. // rng_ss << rng;
  694. // const std::string & rng_str = rng_ss.str();
  695. // write_string(rng_str);
  696. //}
  697. void write_output_ids(struct llama_context * ctx) {
  698. llama_output_reorder(*ctx);
  699. const uint32_t n_outputs = ctx->n_outputs;
  700. std::vector<int32_t> output_pos;
  701. const size_t n_batch = ctx->cparams.n_batch;
  702. const auto & output_ids = ctx->output_ids;
  703. GGML_ASSERT(n_outputs <= ctx->output_size);
  704. output_pos.resize(n_outputs);
  705. // build a more compact representation of the output ids
  706. for (size_t i = 0; i < n_batch; ++i) {
  707. // map an output id to a position in the batch
  708. int32_t pos = output_ids[i];
  709. if (pos >= 0) {
  710. GGML_ASSERT((uint32_t) pos < n_outputs);
  711. output_pos[pos] = i;
  712. }
  713. }
  714. write(&n_outputs, sizeof(n_outputs));
  715. if (n_outputs) {
  716. write(output_pos.data(), n_outputs * sizeof(int32_t));
  717. }
  718. }
  719. void write_logits(const struct llama_context * ctx) {
  720. const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
  721. write(&logits_size, sizeof(logits_size));
  722. if (logits_size) {
  723. write(ctx->logits, logits_size * sizeof(float));
  724. }
  725. }
  726. void write_embeddings(const struct llama_context * ctx) {
  727. const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd);
  728. write(&embeddings_size, sizeof(embeddings_size));
  729. if (embeddings_size) {
  730. write(ctx->embd, embeddings_size * sizeof(float));
  731. }
  732. }
  733. void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) {
  734. for (const auto & range : cell_ranges) {
  735. for (uint32_t i = range.first; i < range.second; ++i) {
  736. const auto & cell = kv_self.cells[i];
  737. const llama_pos pos = cell.pos;
  738. const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
  739. write(&pos, sizeof(pos));
  740. write(&n_seq_id, sizeof(n_seq_id));
  741. if (n_seq_id) {
  742. for (auto seq_id : cell.seq_id) {
  743. write(&seq_id, sizeof(seq_id));
  744. }
  745. }
  746. }
  747. }
  748. }
  749. void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) {
  750. const struct llama_kv_cache & kv_self = ctx->kv_self;
  751. const struct llama_hparams & hparams = ctx->model.hparams;
  752. const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
  753. const uint32_t n_layer = hparams.n_layer;
  754. write(&v_trans, sizeof(v_trans));
  755. write(&n_layer, sizeof(n_layer));
  756. std::vector<uint8_t> tmp_buf;
  757. // Iterate and write all the keys first, each row is a cell
  758. // Get whole range at a time
  759. for (uint32_t il = 0; il < n_layer; ++il) {
  760. const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
  761. // Write key type
  762. const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
  763. write(&k_type_i, sizeof(k_type_i));
  764. // Write row size of key
  765. const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
  766. write(&k_size_row, sizeof(k_size_row));
  767. // Read each range of cells of k_size length each into tmp_buf and write out
  768. for (const auto & range : cell_ranges) {
  769. const size_t range_size = range.second - range.first;
  770. const size_t buf_size = range_size * k_size_row;
  771. write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
  772. }
  773. }
  774. if (!kv_self.v_trans) {
  775. for (uint32_t il = 0; il < n_layer; ++il) {
  776. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  777. // Write value type
  778. const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
  779. write(&v_type_i, sizeof(v_type_i));
  780. // Write row size of value
  781. const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
  782. write(&v_size_row, sizeof(v_size_row));
  783. // Read each range of cells of v_size length each into tmp_buf and write out
  784. for (const auto & range : cell_ranges) {
  785. const size_t range_size = range.second - range.first;
  786. const size_t buf_size = range_size * v_size_row;
  787. write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
  788. }
  789. }
  790. } else {
  791. // When v is transposed, we also need the element size and get the element ranges from each row
  792. const uint32_t kv_size = kv_self.size;
  793. for (uint32_t il = 0; il < n_layer; ++il) {
  794. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  795. // Write value type
  796. const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
  797. write(&v_type_i, sizeof(v_type_i));
  798. // Write element size
  799. const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
  800. write(&v_size_el, sizeof(v_size_el));
  801. // Write GQA embedding size
  802. write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
  803. // For each row, we get the element values of each cell
  804. for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
  805. // Read each range of cells of v_size_el length each into tmp_buf and write out
  806. for (const auto & range : cell_ranges) {
  807. const size_t range_size = range.second - range.first;
  808. const size_t src_offset = (range.first + j * kv_size) * v_size_el;
  809. const size_t buf_size = range_size * v_size_el;
  810. write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
  811. }
  812. }
  813. }
  814. }
  815. }
  816. void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) {
  817. const struct llama_kv_cache & kv_self = ctx->kv_self;
  818. std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
  819. uint32_t cell_count = 0;
  820. // Count the number of cells with the specified seq_id
  821. // Find all the ranges of cells with this seq id (or all, when -1)
  822. uint32_t cell_range_begin = kv_self.size;
  823. for (uint32_t i = 0; i < kv_self.size; ++i) {
  824. const auto & cell = kv_self.cells[i];
  825. if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
  826. ++cell_count;
  827. if (cell_range_begin == kv_self.size) {
  828. cell_range_begin = i;
  829. }
  830. } else {
  831. if (cell_range_begin != kv_self.size) {
  832. cell_ranges.emplace_back(cell_range_begin, i);
  833. cell_range_begin = kv_self.size;
  834. }
  835. }
  836. }
  837. if (cell_range_begin != kv_self.size) {
  838. cell_ranges.emplace_back(cell_range_begin, kv_self.size);
  839. }
  840. // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
  841. uint32_t cell_count_check = 0;
  842. for (const auto & range : cell_ranges) {
  843. cell_count_check += range.second - range.first;
  844. }
  845. GGML_ASSERT(cell_count == cell_count_check);
  846. write(&cell_count, sizeof(cell_count));
  847. write_kv_cache_meta(kv_self, cell_ranges, seq_id);
  848. write_kv_cache_data(ctx, cell_ranges);
  849. }
  850. };
// Abstract reader used to deserialize llama_context state from a buffer or a
// file. Counterpart of llama_data_write - fields must be consumed in exactly
// the order they were written.
struct llama_data_read {
    // return a pointer to the next `size` bytes of the stream and advance past them
    virtual const uint8_t * read(size_t size) = 0;
    // copy the next `size` bytes of the stream into dst and advance past them
    virtual void read_to(void * dst, size_t size) = 0;
    // total number of bytes consumed so far
    virtual size_t get_size_read() = 0;
    virtual ~llama_data_read() = default;

    // read a string stored as a 32-bit length followed by its raw bytes
    void read_string(std::string & str) {
        uint32_t str_size;
        read_to(&str_size, sizeof(str_size));
        str.assign((const char *) read(str_size), str_size);
    }

    // validate model information
    // throws if the stored architecture does not match the loaded model
    void read_model_info(const struct llama_context * ctx) {
        const std::string cur_arch_str = llm_arch_name(ctx->model.arch);
        std::string arch_str;
        read_string(arch_str);
        if (cur_arch_str != arch_str) {
            throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
        }
        // TODO: add more info which needs to be identical but which is not verified otherwise
    }

    //void read_rng(std::mt19937 & rng) {
    //    std::string rng_str;
    //    read_string(rng_str);
    //    std::istringstream rng_ss(rng_str);
    //    rng_ss >> rng;
    //    if (rng_ss.fail()) {
    //        throw std::runtime_error("failed to load RNG state");
    //    }
    //}

    // restore the output-id mapping written by write_output_ids;
    // throws if the outputs cannot be reserved or an id is out of range
    void read_output_ids(struct llama_context * ctx) {
        std::vector<int32_t> output_pos;
        uint32_t n_outputs;
        read_to(&n_outputs, sizeof(n_outputs));
        // make sure the context's output buffers can hold all restored outputs
        if (n_outputs > llama_output_reserve(*ctx, n_outputs)) {
            throw std::runtime_error("could not reserve outputs");
        }
        if (n_outputs) {
            output_pos.resize(n_outputs);
            read_to(output_pos.data(), n_outputs * sizeof(int32_t));
            // invert the stored mapping: output index -> batch position
            // becomes batch position -> output index
            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
                int32_t id = output_pos[i];
                if ((uint32_t) id >= ctx->cparams.n_batch) {
                    throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch));
                }
                ctx->output_ids[id] = i;
            }
            ctx->n_outputs = n_outputs;
        }
    }

    // read the stored logits into ctx->logits;
    // throws if the context's buffer is smaller than the stored data
    void read_logits(struct llama_context * ctx) {
        uint64_t logits_size;
        read_to(&logits_size, sizeof(logits_size));
        if (ctx->logits_size < logits_size) {
            throw std::runtime_error("logits buffer too small");
        }
        if (logits_size) {
            read_to(ctx->logits, logits_size * sizeof(float));
        }
    }

    // read the stored embeddings into ctx->embd;
    // throws if the context's buffer is smaller than the stored data
    void read_embeddings(struct llama_context * ctx) {
        uint64_t embeddings_size;
        read_to(&embeddings_size, sizeof(embeddings_size));
        if (ctx->embd_size < embeddings_size) {
            throw std::runtime_error("embeddings buffer too small");
        }
        if (embeddings_size) {
            read_to(ctx->embd, embeddings_size * sizeof(float));
        }
    }

    // restore per-cell metadata (pos, seq ids) written by write_kv_cache_meta;
    // dest_seq_id != -1 restores into a single sequence, -1 restores the whole cache;
    // returns false (after logging) on malformed data
    bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) {
        struct llama_kv_cache & kv_self = ctx->kv_self;
        if (dest_seq_id != -1) {
            // single sequence
            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
            // build a synthetic batch so the regular slot-finding logic can place the cells
            llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
            batch.n_tokens = cell_count;
            batch.n_seq_tokens = cell_count;
            batch.n_seqs = 1;
            for (uint32_t i = 0; i < cell_count; ++i) {
                llama_pos pos;
                uint32_t n_seq_id;
                read_to(&pos, sizeof(pos));
                read_to(&n_seq_id, sizeof(n_seq_id));
                // single-sequence states are written with n_seq_id == 0 (see write_kv_cache_meta)
                if (n_seq_id != 0) {
                    LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
                    return false;
                }
                batch.pos[i] = pos;
            }
            batch.n_seq_id[0] = 1;
            batch.seq_id[0] = &dest_seq_id;
            if (!llama_kv_cache_find_slot(kv_self, batch)) {
                LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
                return false;
            }
            // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
            // Assume that this is one contiguous block of cells
            GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
            GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
            GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
        } else {
            // whole KV cache restore
            if (cell_count > kv_self.size) {
                LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
                return false;
            }
            llama_kv_cache_clear(kv_self);
            for (uint32_t i = 0; i < cell_count; ++i) {
                llama_kv_cell & cell = kv_self.cells[i];
                llama_pos pos;
                uint32_t n_seq_id;
                read_to(&pos, sizeof(pos));
                read_to(&n_seq_id, sizeof(n_seq_id));
                cell.pos = pos;
                for (uint32_t j = 0; j < n_seq_id; ++j) {
                    llama_seq_id seq_id;
                    read_to(&seq_id, sizeof(seq_id));
                    if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
                        LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
                        return false;
                    }
                    cell.seq_id.insert(seq_id);
                    if (kv_self.recurrent) {
                        // recurrent models track one tail cell per sequence;
                        // a second tail for the same seq_id means the data is corrupt
                        int32_t & tail = kv_self.cells[seq_id].tail;
                        if (tail != -1) {
                            LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
                            return false;
                        }
                        tail = i;
                    }
                }
            }
            kv_self.head = 0;
            kv_self.used = cell_count;
        }
        if (kv_self.recurrent) {
            for (uint32_t i = 0; i < cell_count; ++i) {
                uint32_t cell_id = kv_self.head + i;
                // make sure the recurrent states will keep their restored state
                kv_self.cells[cell_id].src = cell_id;
            }
        }
        return true;
    }

    // restore the K and V tensor data written by write_kv_cache_data into the
    // contiguous block of cells starting at kv_self.head;
    // returns false (after logging) on any layout/type mismatch
    bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) {
        const struct llama_hparams & hparams = ctx->model.hparams;
        struct llama_kv_cache & kv_self = ctx->kv_self;
        uint32_t v_trans;
        uint32_t n_layer;
        read_to(&v_trans, sizeof(v_trans));
        read_to(&n_layer, sizeof(n_layer));
        if (n_layer != hparams.n_layer) {
            LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
            return false;
        }
        if (cell_count > kv_self.size) {
            LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size);
            return false;
        }
        if (kv_self.v_trans != (bool) v_trans) {
            LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
            return false;
        }
        // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
        for (uint32_t il = 0; il < n_layer; ++il) {
            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
            // Read type of key
            int32_t k_type_i_ref;
            read_to(&k_type_i_ref, sizeof(k_type_i_ref));
            const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
            if (k_type_i != k_type_i_ref) {
                LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
                return false;
            }
            // Read row size of key
            uint64_t k_size_row_ref;
            read_to(&k_size_row_ref, sizeof(k_size_row_ref));
            const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
            if (k_size_row != k_size_row_ref) {
                LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
                return false;
            }
            if (cell_count) {
                // Read and set the keys for the whole cell range
                ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row);
            }
        }
        if (!kv_self.v_trans) {
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
                // Read type of value
                int32_t v_type_i_ref;
                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
                if (v_type_i != v_type_i_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                    return false;
                }
                // Read row size of value
                uint64_t v_size_row_ref;
                read_to(&v_size_row_ref, sizeof(v_size_row_ref));
                const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
                if (v_size_row != v_size_row_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
                    return false;
                }
                if (cell_count) {
                    // Read and set the values for the whole cell range
                    ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row);
                }
            }
        } else {
            // For each layer, read the values for each cell (transposed)
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
                // Read type of value
                int32_t v_type_i_ref;
                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
                if (v_type_i != v_type_i_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                    return false;
                }
                // Read element size of value
                uint32_t v_size_el_ref;
                read_to(&v_size_el_ref, sizeof(v_size_el_ref));
                const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
                if (v_size_el != v_size_el_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
                    return false;
                }
                // Read GQA embedding size
                uint32_t n_embd_v_gqa_ref;
                read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
                if (n_embd_v_gqa != n_embd_v_gqa_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
                    return false;
                }
                if (cell_count) {
                    // For each row in the transposed matrix, read the values for the whole cell range
                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                        const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el;
                        ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
                    }
                }
            }
        }
        return true;
    }

    // read the cell count, then restore metadata and tensor data;
    // on failure the affected cache region is cleared before throwing
    void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) {
        uint32_t cell_count;
        read_to(&cell_count, sizeof(cell_count));
        bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count);
        if (!res) {
            // do not leave a partially restored cache behind
            if (seq_id == -1) {
                llama_kv_cache_clear(ctx);
            } else {
                llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
            }
            throw std::runtime_error("failed to restore kv cache");
        }
    }
};
  1116. struct llama_data_write_dummy : llama_data_write {
  1117. size_t size_written = 0;
  1118. llama_data_write_dummy() {}
  1119. void write(const void * /* src */, size_t size) override {
  1120. size_written += size;
  1121. }
  1122. void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
  1123. size_written += size;
  1124. }
  1125. size_t get_size_written() override {
  1126. return size_written;
  1127. }
  1128. };
  1129. struct llama_data_write_buffer : llama_data_write {
  1130. uint8_t * ptr;
  1131. size_t buf_size = 0;
  1132. size_t size_written = 0;
  1133. llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
  1134. void write(const void * src, size_t size) override {
  1135. if (size > buf_size) {
  1136. throw std::runtime_error("unexpectedly reached end of buffer");
  1137. }
  1138. memcpy(ptr, src, size);
  1139. ptr += size;
  1140. size_written += size;
  1141. buf_size -= size;
  1142. }
  1143. void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
  1144. if (size > buf_size) {
  1145. throw std::runtime_error("unexpectedly reached end of buffer");
  1146. }
  1147. ggml_backend_tensor_get(tensor, ptr, offset, size);
  1148. ptr += size;
  1149. size_written += size;
  1150. buf_size -= size;
  1151. }
  1152. size_t get_size_written() override {
  1153. return size_written;
  1154. }
  1155. };
  1156. struct llama_data_read_buffer : llama_data_read {
  1157. const uint8_t * ptr;
  1158. size_t buf_size = 0;
  1159. size_t size_read = 0;
  1160. llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
  1161. const uint8_t * read(size_t size) override {
  1162. const uint8_t * base_ptr = ptr;
  1163. if (size > buf_size) {
  1164. throw std::runtime_error("unexpectedly reached end of buffer");
  1165. }
  1166. ptr += size;
  1167. size_read += size;
  1168. buf_size -= size;
  1169. return base_ptr;
  1170. }
  1171. void read_to(void * dst, size_t size) override {
  1172. memcpy(dst, read(size), size);
  1173. }
  1174. size_t get_size_read() override {
  1175. return size_read;
  1176. }
  1177. };
  1178. struct llama_data_write_file : llama_data_write {
  1179. llama_file * file;
  1180. size_t size_written = 0;
  1181. std::vector<uint8_t> temp_buffer;
  1182. llama_data_write_file(llama_file * f) : file(f) {}
  1183. void write(const void * src, size_t size) override {
  1184. file->write_raw(src, size);
  1185. size_written += size;
  1186. }
  1187. void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
  1188. temp_buffer.resize(size);
  1189. ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
  1190. write(temp_buffer.data(), temp_buffer.size());
  1191. }
  1192. size_t get_size_written() override {
  1193. return size_written;
  1194. }
  1195. };
  1196. struct llama_data_read_file : llama_data_read {
  1197. llama_file * file;
  1198. size_t size_read = 0;
  1199. std::vector<uint8_t> temp_buffer;
  1200. llama_data_read_file(llama_file * f) : file(f) {}
  1201. void read_to(void * dst, size_t size) override {
  1202. file->read_raw(dst, size);
  1203. size_read += size;
  1204. }
  1205. const uint8_t * read(size_t size) override {
  1206. temp_buffer.resize(size);
  1207. read_to(temp_buffer.data(), size);
  1208. return temp_buffer.data();
  1209. }
  1210. size_t get_size_read() override {
  1211. return size_read;
  1212. }
  1213. };
  1214. /** copy state data into either a buffer or file depending on the passed in context
  1215. *
  1216. * file context:
  1217. * llama_file file("/path", "wb");
  1218. * llama_data_write_file data_ctx(&file);
  1219. * llama_state_get_data_internal(ctx, data_ctx);
  1220. *
  1221. * buffer context:
  1222. * std::vector<uint8_t> buf(max_size, 0);
  1223. * llama_data_write_buffer data_ctx(buf.data(), max_size);
  1224. * llama_state_get_data_internal(ctx, data_ctx);
  1225. *
  1226. */
  1227. static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) {
  1228. llama_synchronize(ctx);
  1229. data_ctx.write_model_info(ctx);
  1230. // copy outputs
  1231. data_ctx.write_output_ids(ctx);
  1232. data_ctx.write_logits(ctx);
  1233. data_ctx.write_embeddings(ctx);
  1234. data_ctx.write_kv_cache(ctx);
  1235. return data_ctx.get_size_written();
  1236. }
  1237. size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) {
  1238. llama_data_write_buffer data_ctx(dst, size);
  1239. try {
  1240. return llama_state_get_data_internal(ctx, data_ctx);
  1241. } catch (const std::exception & err) {
  1242. LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
  1243. return 0;
  1244. }
  1245. }
  1246. // Returns the *actual* size of the state.
  1247. // Intended to be used when saving to state to a buffer.
  1248. size_t llama_state_get_size(struct llama_context * ctx) {
  1249. llama_data_write_dummy data_ctx;
  1250. try {
  1251. return llama_state_get_data_internal(ctx, data_ctx);
  1252. } catch (const std::exception & err) {
  1253. LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
  1254. return 0;
  1255. }
  1256. }
  1257. static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) {
  1258. llama_synchronize(ctx);
  1259. data_ctx.read_model_info(ctx);
  1260. // set outputs
  1261. data_ctx.read_output_ids(ctx);
  1262. data_ctx.read_logits(ctx);
  1263. data_ctx.read_embeddings(ctx);
  1264. data_ctx.read_kv_cache(ctx);
  1265. return data_ctx.get_size_read();
  1266. }
  1267. // Sets the state reading from the specified source address
  1268. size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) {
  1269. llama_data_read_buffer data_ctx(src, size);
  1270. try {
  1271. return llama_state_set_data_internal(ctx, data_ctx);
  1272. } catch (const std::exception & err) {
  1273. LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
  1274. return 0;
  1275. }
  1276. }
  1277. static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  1278. llama_file file(path_session, "rb");
  1279. // sanity checks
  1280. {
  1281. const uint32_t magic = file.read_u32();
  1282. const uint32_t version = file.read_u32();
  1283. if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
  1284. LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
  1285. return false;
  1286. }
  1287. }
  1288. // load the prompt
  1289. {
  1290. const uint32_t n_token_count = file.read_u32();
  1291. if (n_token_count > n_token_capacity) {
  1292. LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
  1293. return false;
  1294. }
  1295. file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
  1296. *n_token_count_out = n_token_count;
  1297. }
  1298. // restore the context state
  1299. {
  1300. const size_t n_state_size_cur = file.size() - file.tell();
  1301. llama_data_read_file data_ctx(&file);
  1302. const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);
  1303. if (n_read != n_state_size_cur) {
  1304. LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
  1305. return false;
  1306. }
  1307. }
  1308. return true;
  1309. }
  1310. bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  1311. try {
  1312. return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
  1313. } catch (const std::exception & err) {
  1314. LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
  1315. return false;
  1316. }
  1317. }
  1318. static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  1319. llama_file file(path_session, "wb");
  1320. file.write_u32(LLAMA_SESSION_MAGIC);
  1321. file.write_u32(LLAMA_SESSION_VERSION);
  1322. // save the prompt
  1323. file.write_u32((uint32_t) n_token_count);
  1324. file.write_raw(tokens, sizeof(llama_token) * n_token_count);
  1325. // save the context state using stream saving
  1326. llama_data_write_file data_ctx(&file);
  1327. llama_state_get_data_internal(ctx, data_ctx);
  1328. return true;
  1329. }
  1330. bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  1331. try {
  1332. return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
  1333. } catch (const std::exception & err) {
  1334. LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
  1335. return false;
  1336. }
  1337. }
  1338. static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) {
  1339. llama_synchronize(ctx);
  1340. data_ctx.write_kv_cache(ctx, seq_id);
  1341. return data_ctx.get_size_written();
  1342. }
  1343. size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) {
  1344. llama_data_write_dummy data_ctx;
  1345. return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
  1346. }
  1347. size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
  1348. llama_data_write_buffer data_ctx(dst, size);
  1349. try {
  1350. return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
  1351. } catch (const std::exception & err) {
  1352. LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what());
  1353. return 0;
  1354. }
  1355. }
  1356. static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) {
  1357. llama_synchronize(ctx);
  1358. data_ctx.read_kv_cache(ctx, dest_seq_id);
  1359. return data_ctx.get_size_read();
  1360. }
  1361. size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) {
  1362. llama_data_read_buffer data_ctx(src, size);
  1363. try {
  1364. return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
  1365. } catch (const std::exception & err) {
  1366. LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what());
  1367. return 0;
  1368. }
  1369. }
  1370. static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
  1371. llama_file file(filepath, "wb");
  1372. file.write_u32(LLAMA_STATE_SEQ_MAGIC);
  1373. file.write_u32(LLAMA_STATE_SEQ_VERSION);
  1374. // save the prompt
  1375. file.write_u32((uint32_t) n_token_count);
  1376. file.write_raw(tokens, sizeof(llama_token) * n_token_count);
  1377. // save the context state using stream saving
  1378. llama_data_write_file data_ctx(&file);
  1379. llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
  1380. const size_t res = file.tell();
  1381. GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
  1382. return res;
  1383. }
  1384. static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  1385. llama_file file(filepath, "rb");
  1386. // version checks
  1387. {
  1388. const uint32_t magic = file.read_u32();
  1389. const uint32_t version = file.read_u32();
  1390. if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
  1391. LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
  1392. return 0;
  1393. }
  1394. }
  1395. // load the prompt
  1396. {
  1397. const uint32_t n_token_count = file.read_u32();
  1398. if (n_token_count > n_token_capacity) {
  1399. LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
  1400. return 0;
  1401. }
  1402. file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
  1403. *n_token_count_out = n_token_count;
  1404. }
  1405. // restore the context state
  1406. {
  1407. const size_t state_size = file.size() - file.tell();
  1408. llama_data_read_file data_ctx(&file);
  1409. const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
  1410. if (!nread) {
  1411. LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
  1412. return 0;
  1413. }
  1414. GGML_ASSERT(nread <= state_size);
  1415. GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
  1416. }
  1417. return file.tell();
  1418. }
  1419. size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
  1420. try {
  1421. return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
  1422. } catch (const std::exception & err) {
  1423. LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
  1424. return 0;
  1425. }
  1426. }
  1427. size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  1428. try {
  1429. return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
  1430. } catch (const std::exception & err) {
  1431. LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
  1432. return 0;
  1433. }
  1434. }
  1435. const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
  1436. struct llama_context * ctx
  1437. ) {
  1438. return ctx->model.tensors_by_name;
  1439. }