mllama.cpp

// NOTE: This is modified from clip.cpp for Mllama only
#include "mllama.h"

#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif

#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif

#include <algorithm>
#include <cmath>
#include <cstdarg>
#include <cstdio> // fprintf/vsnprintf, used by LOG and format below
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <vector>

#define REQUIRE(x)                                           \
    do {                                                     \
        if (!(x)) {                                          \
            throw std::runtime_error("REQUIRE failed: " #x); \
        }                                                    \
    } while (0)

#define LOG(fmt, ...) fprintf(stderr, "%s: " fmt "\n", __func__, ##__VA_ARGS__)

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#endif

struct mllama_image {
    int width;
    int height;

    int num_channels = 3;
    int num_tiles = 4;

    int aspect_ratio_id;

    std::vector<float> data;
};
static std::string format(const char *fmt, ...) {
    va_list args;
    va_start(args, fmt);
    std::vector<char> b(128);
    int n = vsnprintf(b.data(), b.size(), fmt, args);
    REQUIRE(n >= 0 && static_cast<size_t>(n) < b.size());
    va_end(args);
    return std::string(b.data(), n);
}
//
// utilities to get data from a gguf file
//

static int get_key_index(const gguf_context *ctx, const char *key) {
    int key_index = gguf_find_key(ctx, key);
    REQUIRE(key_index != -1);
    return key_index;
}

static std::vector<uint32_t> get_u32_array(const gguf_context *ctx, const std::string &key) {
    const int i = get_key_index(ctx, key.c_str());
    const int n = gguf_get_arr_n(ctx, i);
    const uint32_t *data = (uint32_t *)gguf_get_arr_data(ctx, i);

    std::vector<uint32_t> s(n);
    for (size_t j = 0; j < s.size(); j++) {
        s[j] = data[j];
    }

    return s;
}

static uint32_t get_u32(const gguf_context *ctx, const std::string &key) {
    return gguf_get_val_u32(ctx, get_key_index(ctx, key.c_str()));
}

static float get_f32(const gguf_context *ctx, const std::string &key) {
    return gguf_get_val_f32(ctx, get_key_index(ctx, key.c_str()));
}

static std::string get_ftype(int ftype) {
    return ggml_type_name(static_cast<ggml_type>(ftype));
}
//
// mllama layers
//

struct mllama_hparams {
    uint32_t image_size;
    uint32_t patch_size;
    uint32_t hidden_size;
    uint32_t n_intermediate;
    uint32_t projection_dim;
    uint32_t n_head;
    uint32_t n_layer;
    uint32_t n_global_layer;
    uint32_t n_tiles;

    float eps;

    std::vector<bool> intermediate_layers;
};

struct mllama_layer {
    // attention
    struct ggml_tensor *k_w;
    struct ggml_tensor *k_b;
    struct ggml_tensor *q_w;
    struct ggml_tensor *q_b;
    struct ggml_tensor *v_w;
    struct ggml_tensor *v_b;

    struct ggml_tensor *o_w;
    struct ggml_tensor *o_b;

    struct ggml_tensor *attn_gate;

    // layernorm 1
    struct ggml_tensor *ln_1_w;
    struct ggml_tensor *ln_1_b;

    // ff
    struct ggml_tensor *ff_i_w;
    struct ggml_tensor *ff_i_b;
    struct ggml_tensor *ff_o_w;
    struct ggml_tensor *ff_o_b;

    struct ggml_tensor *ff_gate;

    // layernorm 2
    struct ggml_tensor *ln_2_w;
    struct ggml_tensor *ln_2_b;
};

struct mllama_vision_model {
    struct mllama_hparams hparams;

    // embeddings
    struct ggml_tensor *class_embedding;
    struct ggml_tensor *patch_embeddings;
    struct ggml_tensor *position_embeddings;
    struct ggml_tensor *position_embeddings_gate;
    struct ggml_tensor *tile_position_embeddings;
    struct ggml_tensor *tile_position_embeddings_gate;
    struct ggml_tensor *pre_tile_position_embeddings;
    struct ggml_tensor *pre_tile_position_embeddings_gate;
    struct ggml_tensor *post_tile_position_embeddings;
    struct ggml_tensor *post_tile_position_embeddings_gate;

    struct ggml_tensor *pre_ln_w;
    struct ggml_tensor *pre_ln_b;

    std::vector<mllama_layer> layers;
    std::vector<mllama_layer> global_layers;

    struct ggml_tensor *post_ln_w;
    struct ggml_tensor *post_ln_b;

    struct ggml_tensor *mm_0_w = nullptr;
    struct ggml_tensor *mm_0_b = nullptr;
};

struct mllama_ctx {
    struct mllama_vision_model vision_model;

    uint32_t ftype = 1;

    struct gguf_context *ctx_gguf;
    struct ggml_context *ctx_data;

    std::vector<uint8_t> buf_compute_meta;

    // memory buffers to evaluate the model
    ggml_backend_buffer_t params_buffer = nullptr;

    ggml_backend_t backend = nullptr;
    ggml_gallocr_t compute_alloc = nullptr;
};
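
// Overall flow implemented below: mllama_model_load() reads the GGUF file,
// picks a ggml backend, uploads the tensors and reserves compute memory;
// mllama_image_build_graph() constructs the vision encoder graph for a single
// image batch; mllama_image_batch_encode() fills the graph inputs, runs the
// backend and copies the projected embeddings back to the caller.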
static ggml_tensor *mllama_image_build_encoder_layer(
    struct ggml_context *ctx0, const int il, const struct mllama_layer &layer, struct ggml_tensor *embeddings,
    const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {
    struct ggml_tensor *cur = embeddings;

    {
        // layernorm1
        cur = ggml_norm(ctx0, cur, eps);
        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_1_w), layer.ln_1_b);
        ggml_set_name(cur, format("%d pre layernorm", il).c_str());
    }

    {
        // self-attention
        struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
        if (layer.q_b != nullptr) {
            Q = ggml_add(ctx0, Q, layer.q_b);
        }

        Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
        Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
        ggml_set_name(Q, format("%d query", il).c_str());

        struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
        if (layer.k_b != nullptr) {
            K = ggml_add(ctx0, K, layer.k_b);
        }

        K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
        K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
        ggml_set_name(K, format("%d key", il).c_str());

        struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
        if (layer.v_b != nullptr) {
            V = ggml_add(ctx0, V, layer.v_b);
        }

        V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
        ggml_set_name(V, format("%d value", il).c_str());

        struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
        KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head));
        KQ = ggml_soft_max_inplace(ctx0, KQ);
        ggml_set_name(KQ, format("%d KQ", il).c_str());

        struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
        KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
        KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
        KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
        ggml_set_name(KQV, format("%d KQV", il).c_str());

        cur = ggml_mul_mat(ctx0, layer.o_w, KQV);
        if (layer.o_b != nullptr) {
            cur = ggml_add(ctx0, cur, layer.o_b);
        }

        ggml_set_name(cur, format("%d self attention", il).c_str());

        if (layer.attn_gate != nullptr) {
            cur = ggml_mul_inplace(ctx0, cur, layer.attn_gate);
            ggml_set_name(cur, format("%d self attention gate", il).c_str());
        }
    }

    cur = ggml_add(ctx0, cur, embeddings);
    ggml_set_name(cur, format("%d residual", il).c_str());

    embeddings = cur;

    {
        // layernorm2
        cur = ggml_norm(ctx0, cur, eps);
        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.ln_2_w), layer.ln_2_b);
        ggml_set_name(cur, format("%d post layernorm", il).c_str());
    }

    {
        // feed forward
        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_i_w, cur), layer.ff_i_b);
        cur = ggml_gelu_inplace(ctx0, cur);
        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ff_o_w, cur), layer.ff_o_b);
        ggml_set_name(cur, format("%d feed forward", il).c_str());

        if (layer.ff_gate != nullptr) {
            cur = ggml_mul_inplace(ctx0, cur, layer.ff_gate);
            ggml_set_name(cur, format("%d feed forward gate", il).c_str());
        }
    }

    // residual 2
    cur = ggml_add(ctx0, cur, embeddings);
    ggml_set_name(cur, format("%d residual", il).c_str());

    embeddings = cur;

    return embeddings;
}
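
// Shape summary for the encoder layer above (descriptive only, derived from the
// ggml calls it makes): the input embeddings are [hidden_size, seq, batch];
// Q and K are projected and laid out as [d_head, seq, n_head, batch], V as
// [seq, d_head, n_head, batch]; KQ is scaled by 1/sqrt(d_head) and softmaxed,
// and KQV is merged back to [hidden_size, seq, batch] before the output
// projection. Both the attention and feed-forward sub-blocks use a
// pre-layernorm plus a residual add, and each can be scaled by an optional
// learned gate tensor when one is present in the checkpoint.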
static ggml_cgraph *mllama_image_build_graph(mllama_ctx *ctx, const mllama_image_batch *imgs) {
    const auto &model = ctx->vision_model;
    const auto &hparams = model.hparams;

    const int image_size = hparams.image_size;
    const int image_size_width = image_size;
    const int image_size_height = image_size;

    const int patch_size = hparams.patch_size;
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);
    const int hidden_size = hparams.hidden_size;
    const int n_head = hparams.n_head;
    const int d_head = hidden_size / n_head;

    const int batch_size = imgs->size;
    REQUIRE(batch_size == 1);

    int num_tiles = 4;
    int num_channels = 3;
    if (imgs->data != nullptr) {
        num_tiles = imgs->data[0].num_tiles > 0 ? imgs->data[0].num_tiles : num_tiles;
        num_channels = imgs->data[0].num_channels > 0 ? imgs->data[0].num_channels : num_channels;
    }

    struct ggml_init_params params = {
        ctx->buf_compute_meta.size(), // mem_size
        ctx->buf_compute_meta.data(), // mem_buffer
        true,                         // no_alloc
    };

    struct ggml_context *ctx0 = ggml_init(params);
    struct ggml_cgraph *gf = ggml_new_graph(ctx0);

    struct ggml_tensor *inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, num_channels, num_tiles);
    ggml_set_name(inp_raw, "inp_raw");
    ggml_set_input(inp_raw);

    struct ggml_tensor *inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
    inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, num_tiles);
    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

    struct ggml_tensor *aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, imgs->size);
    ggml_set_name(aspect_ratios, "aspect_ratios");
    ggml_set_input(aspect_ratios);

    if (model.pre_tile_position_embeddings != nullptr) {
        struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);
        ggml_set_name(pre_tile_position_embeddings, "pre_tile_position_embeddings");

        pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, num_tiles);
        if (model.pre_tile_position_embeddings_gate != nullptr) {
            pre_tile_position_embeddings = ggml_mul_inplace(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
        }

        inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
    }

    struct ggml_tensor *embeddings = inp;
    if (model.class_embedding != nullptr) {
        // concat class_embeddings and patch_embeddings
        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, num_tiles);
        ggml_set_name(embeddings, "embeddings");
        ggml_set_input(embeddings);

        for (int i = 0; i < num_tiles; ++i) {
            // repeat class embeddings for each tile
            embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
        }

        embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
    }

    struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    struct ggml_tensor *position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
    if (model.position_embeddings_gate != nullptr) {
        position_embd = ggml_mul_inplace(ctx0, position_embd, model.position_embeddings_gate);
    }

    embeddings = ggml_add(ctx0, embeddings, position_embd);

    if (model.tile_position_embeddings != nullptr) {
        struct ggml_tensor *tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
        ggml_set_name(tile_position_embeddings, "tile_position_embeddings");

        tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, num_positions, num_tiles);
        if (model.tile_position_embeddings_gate != nullptr) {
            tile_position_embeddings = ggml_mul_inplace(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
        }

        embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
    }

    // pre-layernorm
    if (model.pre_ln_w != nullptr) {
        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_ln_w);
        if (model.pre_ln_b != nullptr) {
            embeddings = ggml_add(ctx0, embeddings, model.pre_ln_b);
        }

        ggml_set_name(embeddings, "pre layernorm");
    }

    // pad the per-tile sequence length up to the next multiple of 8
    const int num_padding_patches = (8 - embeddings->ne[1] % 8) % 8;

    embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
    embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);

    // encoder
    auto intermediate_layers = hparams.intermediate_layers;
    const int num_intermediate_layers = std::count(intermediate_layers.begin(), intermediate_layers.end(), true);
    struct ggml_tensor *intermediate_embd = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, num_intermediate_layers, hidden_size, (num_positions + num_padding_patches) * num_tiles);
    ggml_set_name(intermediate_embd, "intermediate_embeddings");
    ggml_set_input(intermediate_embd);

    for (size_t il = 0, s = 0; il < model.layers.size(); il++) {
        if (intermediate_layers[il]) {
            intermediate_embd = ggml_acc(
                ctx0, intermediate_embd,
                ggml_reshape_3d(ctx0, embeddings, 1, embeddings->ne[0], embeddings->ne[1]),
                intermediate_embd->nb[1], intermediate_embd->nb[2], intermediate_embd->nb[3], s * embeddings->nb[0]);
            s++;
        }

        embeddings = mllama_image_build_encoder_layer(
            ctx0, il, model.layers[il], embeddings,
            hparams.eps, hidden_size, batch_size, n_head, d_head);
    }

    // post-layernorm
    if (model.post_ln_w != nullptr) {
        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_ln_w);
        if (model.post_ln_b != nullptr) {
            embeddings = ggml_add(ctx0, embeddings, model.post_ln_b);
        }

        ggml_set_name(embeddings, "post layernorm");
    }

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);

    if (model.post_tile_position_embeddings != nullptr) {
        struct ggml_tensor *post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);
        ggml_set_name(post_tile_position_embeddings, "post_tile_position_embeddings");

        post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, num_tiles);
        if (model.post_tile_position_embeddings_gate != nullptr) {
            post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
        }

        embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
    }

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_tiles * (num_positions + num_padding_patches), 1);

    // global encoder
    for (size_t il = 0; il < model.global_layers.size(); il++) {
        embeddings = mllama_image_build_encoder_layer(
            ctx0, il, model.global_layers[il], embeddings,
            hparams.eps, hidden_size, batch_size, n_head, d_head);
    }

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, num_positions + num_padding_patches, num_tiles);
    embeddings = ggml_view_3d(ctx0, embeddings, hidden_size, num_positions, num_tiles, embeddings->nb[1], embeddings->nb[2], 0);

    intermediate_embd = ggml_reshape_3d(ctx0, intermediate_embd, intermediate_embd->ne[0] * intermediate_embd->ne[1], num_positions + num_padding_patches, num_tiles);
    intermediate_embd = ggml_view_3d(ctx0, intermediate_embd, intermediate_embd->ne[0], num_positions, num_tiles, intermediate_embd->nb[1], intermediate_embd->nb[2], 0);

    embeddings = ggml_concat(ctx0, embeddings, intermediate_embd, 0);
    ggml_set_name(embeddings, "cross attention states");

    // mllama projector
    embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_0_w, embeddings), model.mm_0_b);
    ggml_set_name(embeddings, "multi modal projector");

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    ggml_free(ctx0);

    return gf;
}
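
// The graph built above exposes its inputs by name ("inp_raw", "aspect_ratios",
// "positions" and, depending on which tensors the checkpoint provides,
// "embeddings" and "intermediate_embeddings"); mllama_image_batch_encode()
// looks them up with ggml_graph_get_tensor() and fills them before running the
// graph. The final node, named "multi modal projector", is the output that is
// copied back to the caller.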
static struct ggml_tensor *mllama_tensor_load(struct ggml_context *ctx, const char *name, const bool optional) {
    struct ggml_tensor *cur = ggml_get_tensor(ctx, name);
    REQUIRE(cur != nullptr || optional);
    return cur;
}

static std::vector<struct mllama_layer> mllama_layers_load(struct ggml_context *ctx, const char *prefix, const int n) {
    std::vector<struct mllama_layer> layers(n);
    for (int i = 0; i < n; i++) {
        auto &layer = layers[i];
        layer.ln_1_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.weight", prefix, i).c_str(), false);
        layer.ln_1_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln1.bias", prefix, i).c_str(), false);
        layer.ln_2_w = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.weight", prefix, i).c_str(), false);
        layer.ln_2_b = mllama_tensor_load(ctx, format("%s.blk.%d.ln2.bias", prefix, i).c_str(), false);

        layer.k_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.weight", prefix, i).c_str(), false);
        layer.k_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_k.bias", prefix, i).c_str(), true);
        layer.q_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.weight", prefix, i).c_str(), false);
        layer.q_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_q.bias", prefix, i).c_str(), true);
        layer.v_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.weight", prefix, i).c_str(), false);
        layer.v_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_v.bias", prefix, i).c_str(), true);
        layer.o_w = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.weight", prefix, i).c_str(), false);
        layer.o_b = mllama_tensor_load(ctx, format("%s.blk.%d.attn_out.bias", prefix, i).c_str(), true);

        layer.ff_i_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.weight", prefix, i).c_str(), false);
        layer.ff_i_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_down.bias", prefix, i).c_str(), false);
        layer.ff_o_w = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.weight", prefix, i).c_str(), false);
        layer.ff_o_b = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_up.bias", prefix, i).c_str(), false);

        layer.attn_gate = mllama_tensor_load(ctx, format("%s.blk.%d.attn_gate", prefix, i).c_str(), true);
        layer.ff_gate = mllama_tensor_load(ctx, format("%s.blk.%d.ffn_gate", prefix, i).c_str(), true);
    }

    return layers;
}
// read and create ggml_context containing the tensors and their data
struct mllama_ctx *mllama_model_load(const char *fname, const int verbosity = 1) {
    struct ggml_context *meta = nullptr;

    struct gguf_init_params params = {
        true,  // no_alloc
        &meta, // ctx
    };

    struct gguf_context *ctx = gguf_init_from_file(fname, params);
    REQUIRE(ctx != nullptr);

    if (verbosity >= 1) {
        const int n_tensors = gguf_get_n_tensors(ctx);
        const int n_kv = gguf_get_n_kv(ctx);
        const std::string ftype = get_ftype(get_u32(ctx, "general.file_type"));
        const int idx_desc = get_key_index(ctx, "general.description");
        const std::string description = gguf_get_val_str(ctx, idx_desc);
        const int idx_name = gguf_find_key(ctx, "general.name");
        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
            const std::string name = gguf_get_val_str(ctx, idx_name);
            LOG("model name: %s", name.c_str());
        }
        LOG("description: %s", description.c_str());
        LOG("GGUF version: %d", gguf_get_version(ctx));
        LOG("alignment: %zu", gguf_get_alignment(ctx));
        LOG("n_tensors: %d", n_tensors);
        LOG("n_kv: %d", n_kv);
        LOG("ftype: %s", ftype.c_str());
        LOG("");
    }

    const int n_tensors = gguf_get_n_tensors(ctx);

    mllama_ctx *new_mllama = new mllama_ctx{};

#ifdef GGML_USE_CUDA
    new_mllama->backend = ggml_backend_cuda_init(0);
    LOG("vision using CUDA backend");
#endif

#ifdef GGML_USE_METAL
    new_mllama->backend = ggml_backend_metal_init();
    LOG("vision using Metal backend");
#endif

#ifdef GGML_USE_CANN
    new_mllama->backend = ggml_backend_cann_init(0);
    LOG("vision using CANN backend");
#endif

#ifdef GGML_USE_VULKAN
    new_mllama->backend = ggml_backend_vk_init(0);
    LOG("vision using Vulkan backend");
#endif

    if (!new_mllama->backend) {
        new_mllama->backend = ggml_backend_cpu_init();
        LOG("vision using CPU backend");
    }

    // load tensors
    {
        std::vector<uint8_t> read_buf;

        struct ggml_init_params params = {
            (n_tensors + 1) * ggml_tensor_overhead(), // mem_size
            nullptr,                                  // mem_buffer
            true,                                     // no_alloc
        };

        new_mllama->ctx_data = ggml_init(params);
        if (!new_mllama->ctx_data) {
            LOG("ggml_init() failed");
            mllama_free(new_mllama);
            gguf_free(ctx);
            return nullptr;
        }

#ifdef _WIN32
        int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
        if (!wlen) {
            mllama_free(new_mllama);
            gguf_free(ctx);
            return nullptr;
        }
        wchar_t *wbuf = (wchar_t *)malloc(wlen * sizeof(wchar_t));
        wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wbuf, wlen);
        if (!wlen) {
            free(wbuf);
            mllama_free(new_mllama);
            gguf_free(ctx);
            return nullptr;
        }
        auto fin = std::ifstream(wbuf, std::ios::binary);
        free(wbuf);
#else
        auto fin = std::ifstream(fname, std::ios::binary);
#endif
        if (!fin) {
            LOG("cannot open model file for loading tensors");
            mllama_free(new_mllama);
            gguf_free(ctx);
            return nullptr;
        }

        // add tensors to context
        for (int i = 0; i < n_tensors; ++i) {
            const char *name = gguf_get_tensor_name(ctx, i);
            struct ggml_tensor *t = ggml_get_tensor(meta, name);
            struct ggml_tensor *cur = ggml_dup_tensor(new_mllama->ctx_data, t);
            ggml_set_name(cur, name);
        }

        // alloc memory and offload data
        new_mllama->params_buffer = ggml_backend_alloc_ctx_tensors(new_mllama->ctx_data, new_mllama->backend);
        for (int i = 0; i < n_tensors; ++i) {
            const char *name = gguf_get_tensor_name(ctx, i);
            struct ggml_tensor *cur = ggml_get_tensor(new_mllama->ctx_data, name);
            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
                LOG("failed to seek for tensor %s", name);
                mllama_free(new_mllama);
                gguf_free(ctx);
                return nullptr;
            }

            const size_t num_bytes = ggml_nbytes(cur);
            if (ggml_backend_buffer_is_host(new_mllama->params_buffer)) {
                // for the CPU and Metal backend, we can read directly into the tensor
                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
            } else {
                // read into a temporary buffer first, then copy to device memory
                read_buf.resize(num_bytes);
                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
            }
        }

        fin.close();
    }

    // load vision model
    auto &vision_model = new_mllama->vision_model;
    auto &hparams = vision_model.hparams;
    hparams.hidden_size = get_u32(ctx, "mllama.vision.embedding_length");
    hparams.n_head = get_u32(ctx, "mllama.vision.attention.head_count");
    hparams.n_intermediate = get_u32(ctx, "mllama.vision.feed_forward_length");
    hparams.n_layer = get_u32(ctx, "mllama.vision.block_count");
    hparams.n_global_layer = get_u32(ctx, "mllama.vision.global.block_count");
    hparams.n_tiles = get_u32(ctx, "mllama.vision.max_num_tiles");
    hparams.image_size = get_u32(ctx, "mllama.vision.image_size");
    hparams.patch_size = get_u32(ctx, "mllama.vision.patch_size");
    hparams.projection_dim = get_u32(ctx, "mllama.vision.projection_dim");
    hparams.eps = get_f32(ctx, "mllama.vision.attention.layer_norm_epsilon");

    std::vector<uint32_t> intermediate_layers_indices = get_u32_array(ctx, "mllama.vision.intermediate_layers_indices");
    hparams.intermediate_layers.resize(hparams.n_layer);
    for (size_t i = 0; i < intermediate_layers_indices.size(); i++) {
        hparams.intermediate_layers[intermediate_layers_indices[i]] = true;
    }

    if (verbosity >= 2) {
        LOG("");
        LOG("vision model hparams");
        LOG("image_size       %d", hparams.image_size);
        LOG("patch_size       %d", hparams.patch_size);
        LOG("v_hidden_size    %d", hparams.hidden_size);
        LOG("v_n_intermediate %d", hparams.n_intermediate);
        LOG("v_projection_dim %d", hparams.projection_dim);
        LOG("v_n_head         %d", hparams.n_head);
        LOG("v_n_layer        %d", hparams.n_layer);
        LOG("v_n_global_layer %d", hparams.n_global_layer);
        LOG("v_eps            %f", hparams.eps);
    }

    vision_model.class_embedding = mllama_tensor_load(new_mllama->ctx_data, "v.class_embd", true);
    vision_model.patch_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.patch_embd.weight", true);

    vision_model.position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.weight", true);
    vision_model.position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.position_embd.gate", true);

    vision_model.pre_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.weight", true);
    vision_model.pre_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.pre_ln.bias", true);
    vision_model.post_ln_w = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.weight", true);
    vision_model.post_ln_b = mllama_tensor_load(new_mllama->ctx_data, "v.post_ln.bias", true);

    vision_model.tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.weight", true);
    vision_model.tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.tile_position_embd.gate", true);

    vision_model.pre_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.weight", true);
    vision_model.pre_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.pre_tile_position_embd.gate", true);

    vision_model.post_tile_position_embeddings = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.weight", true);
    vision_model.post_tile_position_embeddings_gate = mllama_tensor_load(new_mllama->ctx_data, "v.post_tile_position_embd.gate", true);

    vision_model.mm_0_w = mllama_tensor_load(new_mllama->ctx_data, "mm.0.weight", false);
    vision_model.mm_0_b = mllama_tensor_load(new_mllama->ctx_data, "mm.0.bias", false);

    vision_model.layers = mllama_layers_load(new_mllama->ctx_data, "v", hparams.n_layer);
    vision_model.global_layers = mllama_layers_load(new_mllama->ctx_data, "v.global", hparams.n_global_layer);

    ggml_free(meta);

    new_mllama->ctx_gguf = ctx;

    {
        // measure mem requirement and allocate
        new_mllama->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
        new_mllama->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_mllama->backend));
        struct mllama_image_batch batch = {}; // zero-initialize so batch.data is a null pointer
        batch.size = 1;
        ggml_cgraph *gf = mllama_image_build_graph(new_mllama, &batch);
        ggml_gallocr_reserve(new_mllama->compute_alloc, gf);
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_mllama->compute_alloc, 0);
        LOG("compute allocated memory: %.2f MB", compute_memory_buffer_size / 1024.0 / 1024.0);
    }

    return new_mllama;
}
struct mllama_image *mllama_image_init() {
    return new mllama_image();
}

void mllama_image_free(struct mllama_image *img) { delete img; }

void mllama_image_batch_free(struct mllama_image_batch *batch) {
    if (batch->size > 0) {
        delete[] batch->data;
        batch->size = 0;
    }
}

bool mllama_image_load_from_data(const void *data, const int n, const int width, const int height, const int num_channels, const int num_tiles, const int aspect_ratio_id, struct mllama_image *img) {
    img->width = width;
    img->height = height;
    img->num_channels = num_channels;
    img->num_tiles = num_tiles;
    img->aspect_ratio_id = aspect_ratio_id;
    img->data.resize(n);
    memcpy(img->data.data(), data, n);
    return true;
}

inline int mllama(int x, int lower, int upper) {
    return std::max(lower, std::min(x, upper));
}

void mllama_free(mllama_ctx *ctx) {
    ggml_free(ctx->ctx_data);
    gguf_free(ctx->ctx_gguf);

    ggml_backend_buffer_free(ctx->params_buffer);
    ggml_backend_free(ctx->backend);
    ggml_gallocr_free(ctx->compute_alloc);
    delete ctx;
}
bool mllama_image_encode(struct mllama_ctx *ctx, const int n_threads, mllama_image *img, float *vec) {
    mllama_image_batch imgs{};
    imgs.size = 1;
    imgs.data = img;
    return mllama_image_batch_encode(ctx, n_threads, &imgs, vec);
}

bool mllama_image_batch_encode(mllama_ctx *ctx, const int n_threads, const mllama_image_batch *imgs, float *vec) {
    int batch_size = imgs->size;
    REQUIRE(batch_size == 1);

    // build the inference graph
    ggml_cgraph *gf = mllama_image_build_graph(ctx, imgs);
    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);

    // set inputs
    const auto &model = ctx->vision_model;
    const auto &hparams = model.hparams;

    const int image_size = hparams.image_size;
    int image_size_width = image_size;
    int image_size_height = image_size;

    const int patch_size = hparams.patch_size;
    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int num_positions = num_patches + (model.class_embedding == nullptr ? 0 : 1);

    {
        struct ggml_tensor *inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
        ggml_backend_tensor_set(inp_raw, imgs->data[0].data.data(), 0, ggml_nbytes(inp_raw));
    }

    {
        struct ggml_tensor *embeddings = ggml_graph_get_tensor(gf, "embeddings");
        if (embeddings != nullptr) {
            void *zeros = malloc(ggml_nbytes(embeddings));
            memset(zeros, 0, ggml_nbytes(embeddings));
            ggml_backend_tensor_set(embeddings, zeros, 0, ggml_nbytes(embeddings));
            free(zeros);
        }
    }

    {
        struct ggml_tensor *positions = ggml_graph_get_tensor(gf, "positions");
        if (positions != nullptr) {
            int *positions_data = (int *)malloc(ggml_nbytes(positions));
            for (int i = 0; i < num_positions; i++) {
                positions_data[i] = i;
            }
            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
            free(positions_data);
        }
    }

    {
        struct ggml_tensor *aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
        if (aspect_ratios != nullptr) {
            int *aspect_ratios_data = (int *)malloc(ggml_nbytes(aspect_ratios));
            aspect_ratios_data[0] = imgs->data[0].aspect_ratio_id;
            ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data, 0, ggml_nbytes(aspect_ratios));
            free(aspect_ratios_data);
        }
    }

    {
        struct ggml_tensor *intermediate_embeddings = ggml_graph_get_tensor(gf, "intermediate_embeddings");
        if (intermediate_embeddings != nullptr) {
            void *zeros = malloc(ggml_nbytes(intermediate_embeddings));
            memset(zeros, 0, ggml_nbytes(intermediate_embeddings));
            ggml_backend_tensor_set(intermediate_embeddings, zeros, 0, ggml_nbytes(intermediate_embeddings));
            free(zeros);
        }
    }

    if (ggml_backend_is_cpu(ctx->backend)) {
        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
    }

#ifdef GGML_USE_METAL
    if (ggml_backend_is_metal(ctx->backend)) {
        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
    }
#endif

    ggml_backend_graph_compute(ctx->backend, gf);

    // the last node is the embedding tensor
    struct ggml_tensor *embeddings = gf->nodes[gf->n_nodes - 1];

    // copy the embeddings to the location passed by the user
    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));

    return true;
}
int32_t mllama_image_size(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.image_size;
}

int32_t mllama_patch_size(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.patch_size;
}

int32_t mllama_hidden_size(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.hidden_size;
}

int mllama_n_patches(const struct mllama_ctx *ctx) {
    const auto &hparams = ctx->vision_model.hparams;
    return (hparams.image_size / hparams.patch_size) * (hparams.image_size / hparams.patch_size);
}

int mllama_n_positions(const struct mllama_ctx *ctx) {
    return mllama_n_patches(ctx) + (ctx->vision_model.class_embedding == nullptr ? 0 : 1);
}

int mllama_n_tiles(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.n_tiles;
}

int mllama_n_embd(const struct mllama_ctx *ctx) {
    return ctx->vision_model.hparams.projection_dim;
}

size_t mllama_n_embd_bytes(const struct mllama_ctx *ctx) {
    return mllama_n_positions(ctx) * mllama_n_embd(ctx) * mllama_n_tiles(ctx) * sizeof(float);
}
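
// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original file; kept compiled out):
// a minimal caller of the public API defined above. It assumes the image has
// already been preprocessed upstream into tiled float pixel data, that `n` is
// the size argument mllama_image_load_from_data() expects for that buffer, and
// the thread count is an arbitrary example value.
// ---------------------------------------------------------------------------
#if 0
static void mllama_example_usage(const char *model_path, const void *pixels, int n,
                                 int width, int height, int num_channels, int num_tiles,
                                 int aspect_ratio_id) {
    // load the vision model from a GGUF file
    struct mllama_ctx *ctx = mllama_model_load(model_path, /*verbosity=*/1);
    if (ctx == nullptr) {
        return;
    }

    // wrap the preprocessed pixel data in an mllama_image
    struct mllama_image *img = mllama_image_init();
    mllama_image_load_from_data(pixels, n, width, height, num_channels, num_tiles, aspect_ratio_id, img);

    // encode: the output buffer must hold mllama_n_embd_bytes(ctx) bytes
    std::vector<float> embd(mllama_n_embd_bytes(ctx) / sizeof(float));
    mllama_image_encode(ctx, /*n_threads=*/4, img, embd.data());

    // embd now holds the projected cross-attention states for the image
    mllama_image_free(img);
    mllama_free(ctx);
}
#endif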