sgemm.cpp 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219
  1. // Copyright 2024 Mozilla Foundation
  2. //
  3. // Permission is hereby granted, free of charge, to any person obtaining
  4. // a copy of this software and associated documentation files (the
  5. // "Software"), to deal in the Software without restriction, including
  6. // without limitation the rights to use, copy, modify, merge, publish,
  7. // distribute, sublicense, and/or sell copies of the Software, and to
  8. // permit persons to whom the Software is furnished to do so, subject to
  9. // the following conditions:
  10. //
  11. // The above copyright notice and this permission notice shall be
  12. // included in all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  15. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  16. // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  17. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  18. // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  19. // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  20. // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. // SOFTWARE.
  22. //
  23. // _ _ ___ _ _ ___
  24. // | |_(_)_ _ _ _| _ ) | /_\ / __|
  25. // | _| | ' \ || | _ \ |__ / _ \\__ \.
  26. // \__|_|_||_\_, |___/____/_/ \_\___/
  27. // |__/
  28. //
  29. // BASIC LINEAR ALGEBRA SUBPROGRAMS
  30. //
  31. //
  32. // This file implements multithreaded CPU matrix multiplication for the
  33. // common contiguous use case C = Aᵀ * B. These kernels are designed to
  34. // have excellent performance[1] for matrices that fit in the CPU cache
  35. // without imposing any overhead such as cache filling or malloc calls.
  36. //
  37. // This implementation does not guarantee any upper bound with rounding
  38. // errors, which grow along with k. Our goal's to maximally exploit the
  39. // hardware for performance, and then use whatever resources remain for
  40. // improving numerical accuracy.
  41. //
  42. // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
  43. // Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
  44. #if defined(__GNUC__)
  45. #pragma GCC diagnostic ignored "-Wpedantic"
  46. #pragma GCC diagnostic ignored "-Wignored-attributes"
  47. #endif
  48. #include "sgemm.h"
  49. #include "ggml-impl.h"
  50. #include "ggml-cpu-impl.h"
  51. #include "ggml-quants.h"
  // NOINLINE: compiler-specific "never inline" attribute, applied to the
  // mnpack()/gemm() kernels in this file.
  #if !defined(_MSC_VER)
  #define NOINLINE __attribute__((__noinline__))
  #else
  #define NOINLINE __declspec(noinline)
  #endif

  // VECTOR_REGISTERS: register budget used by the `#if VECTOR_REGISTERS == 32`
  // tile-size selection in the mnpack() routines.
  #if !defined(__ARM_NEON) && !defined(__AVX512F__)
  #define VECTOR_REGISTERS 16
  #else
  #define VECTOR_REGISTERS 32
  #endif

  // MM256_SET_M128I(a, b): 256-bit integer vector whose low 128 bits are `b`
  // and whose high 128 bits are `a`.
  #define MM256_SET_M128I(a, b) \
      _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
  63. namespace {
  64. inline float unhalf(ggml_fp16_t d) {
  65. return GGML_FP16_TO_FP32(d);
  66. }
  67. ////////////////////////////////////////////////////////////////////////////////////////////////////
  68. // VECTORIZED ARITHMETIC OPERATIONS
  69. #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  70. inline __m128 add(__m128 x, __m128 y) { return _mm_add_ps(x, y); }
  71. inline __m128 sub(__m128 x, __m128 y) { return _mm_sub_ps(x, y); }
  72. inline __m128 mul(__m128 x, __m128 y) { return _mm_mul_ps(x, y); }
  73. #endif // __SSE__
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // Element-wise float arithmetic on 256-bit vectors.
  inline __m256 add(__m256 x, __m256 y) {
      return _mm256_add_ps(x, y);
  }
  inline __m256 sub(__m256 x, __m256 y) {
      return _mm256_sub_ps(x, y);
  }
  inline __m256 mul(__m256 x, __m256 y) {
      return _mm256_mul_ps(x, y);
  }
  #endif // __AVX__
  #if defined(__AVX512F__)
  // Element-wise float arithmetic on 512-bit vectors.
  inline __m512 add(__m512 x, __m512 y) {
      return _mm512_add_ps(x, y);
  }
  inline __m512 sub(__m512 x, __m512 y) {
      return _mm512_sub_ps(x, y);
  }
  inline __m512 mul(__m512 x, __m512 y) {
      return _mm512_mul_ps(x, y);
  }
  #endif // __AVX512F__
  #if defined(__ARM_NEON)
  // Element-wise single-precision arithmetic on 128-bit NEON vectors.
  inline float32x4_t add(float32x4_t x, float32x4_t y) {
      return vaddq_f32(x, y);
  }
  inline float32x4_t sub(float32x4_t x, float32x4_t y) {
      return vsubq_f32(x, y);
  }
  inline float32x4_t mul(float32x4_t x, float32x4_t y) {
      return vmulq_f32(x, y);
  }
  #endif // __ARM_NEON
  #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
  // Element-wise half-precision arithmetic on 128-bit NEON vectors.
  //
  // The !_MSC_VER condition mirrors the guards on the float16x8_t madd(),
  // hsum() and load() helpers in this file, so the fp16 helper set is either
  // entirely present or entirely absent for a given toolchain.
  // NOTE(review): presumably those guards exist because MSVC's NEON vector
  // types are not distinct overloadable types -- confirm against arm64_neon.h.
  inline float16x8_t add(float16x8_t x, float16x8_t y) {
      return vaddq_f16(x, y);
  }
  inline float16x8_t sub(float16x8_t x, float16x8_t y) {
      return vsubq_f16(x, y);
  }
  inline float16x8_t mul(float16x8_t x, float16x8_t y) {
      return vmulq_f16(x, y);
  }
  #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  94. ////////////////////////////////////////////////////////////////////////////////////////////////////
  95. // VECTORIZED FUSED MULTIPLY ADD
  /**
   * Generic fallback for a * b + c.
   *
   * Performs the multiply and the add as two separate steps using the
   * overloaded mul() and add() helpers; the specializations that follow
   * replace it with a fused instruction where one is available.
   */
  template <typename T, typename U>
  inline U madd(T a, T b, U c) {
      U sum = add(mul(a, b), c);
      return sum;
  }
  #if defined(__FMA__)
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // 256-bit specialization: a * b + c through _mm256_fmadd_ps.
  template <>
  inline __m256 madd(__m256 a, __m256 b, __m256 c) {
      const __m256 fused = _mm256_fmadd_ps(a, b, c);
      return fused;
  }
  #endif
  #if defined(__AVX512F__)
  // 512-bit specialization: a * b + c through _mm512_fmadd_ps.
  template <>
  inline __m512 madd(__m512 a, __m512 b, __m512 c) {
      const __m512 fused = _mm512_fmadd_ps(a, b, c);
      return fused;
  }
  #endif
  #endif // __FMA__
  #if defined(__ARM_FEATURE_FMA)
  // Single-precision NEON specialization. vfmaq_f32 takes the addend first,
  // hence the (c, b, a) argument order.
  template <>
  inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
      const float32x4_t fused = vfmaq_f32(c, b, a);
      return fused;
  }
  #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
  // Half-precision NEON specialization, same argument convention as above.
  template <>
  inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
      const float16x8_t fused = vfmaq_f16(c, b, a);
      return fused;
  }
  #endif
  #endif // __ARM_FEATURE_FMA
  129. ////////////////////////////////////////////////////////////////////////////////////////////////////
  130. // VECTORIZED HORIZONTAL SUM
  #if defined(__ARM_NEON)
  // Adds the four float lanes of x into a single scalar.
  inline float hsum(float32x4_t x) {
      const float total = vaddvq_f32(x);
      return total;
  }
  #endif // __ARM_NEON
  #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
  // Adds the eight half-precision lanes of x into one float. Both halves are
  // widened to float32 first, so the reduction itself runs in single precision.
  inline float hsum(float16x8_t x) {
      const float32x4_t lo = vcvt_f32_f16(vget_low_f16(x));
      const float32x4_t hi = vcvt_f32_f16(vget_high_f16(x));
      return vaddvq_f32(vaddq_f32(lo, hi));
  }
  #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  142. #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  143. inline float hsum(__m128 x) {
  144. #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  145. x = _mm_add_ps(x, _mm_movehl_ps(x, x));
  146. x = _mm_add_ss(x, _mm_movehdup_ps(x));
  147. #else
  148. __m128 t;
  149. t = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));
  150. x = _mm_add_ps(x, t);
  151. t = _mm_movehl_ps(t, x);
  152. x = _mm_add_ss(x, t);
  153. #endif
  154. return _mm_cvtss_f32(x);
  155. }
  156. #endif
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // Adds the eight float lanes of x: the two 128-bit halves are summed
  // lane-wise and the result is reduced by the __m128 overload.
  inline float hsum(__m256 x) {
      const __m128 upper = _mm256_extractf128_ps(x, 1);
      const __m128 lower = _mm256_castps256_ps128(x);
      return hsum(_mm_add_ps(upper, lower));
  }
  #endif // __AVX__
  #if defined(__AVX512F__)
  // Adds the sixteen float lanes of x into a single scalar.
  inline float hsum(__m512 x) {
      const float total = _mm512_reduce_add_ps(x);
      return total;
  }
  #endif // __AVX512F__
  168. ////////////////////////////////////////////////////////////////////////////////////////////////////
  169. // VECTORIZED MEMORY LOADING
  // Primary template for vector loads: reads a vector of type V from `src`.
  // Declaration only; each supported (vector type, element type) pair is
  // provided as an explicit specialization below.
  template <typename V, typename S>
  V load(const S *src);
  #if defined(__ARM_NEON)
  // Four floats -> float32x4_t.
  template <>
  inline float32x4_t load(const float *p) {
      return vld1q_f32(p);
  }
  #if !defined(_MSC_VER)
  // Eight halves -> float16x8_t, no conversion.
  template <>
  inline float16x8_t load(const ggml_fp16_t *p) {
      const float16_t *src = reinterpret_cast<const float16_t *>(p);
      return vld1q_f16(src);
  }
  // Four halves, widened to float32x4_t.
  template <>
  inline float32x4_t load(const ggml_fp16_t *p) {
      const float16_t *src = reinterpret_cast<const float16_t *>(p);
      return vcvt_f32_f16(vld1_f16(src));
  }
  #endif // _MSC_VER
  #endif // __ARM_NEON
  184. #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  185. template <> inline __m128 load(const float *p) {
  186. return _mm_loadu_ps(p);
  187. }
  188. #endif // __SSE__
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // Eight floats -> __m256 using an unaligned load.
  template <>
  inline __m256 load(const float *p) {
      const __m256 v = _mm256_loadu_ps(p);
      return v;
  }
  #endif // __AVX__
  #if defined(__F16C__)
  // Eight halves (one unaligned 128-bit load), widened to eight floats.
  template <>
  inline __m256 load(const ggml_fp16_t *p) {
      const __m128i halves = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
      return _mm256_cvtph_ps(halves);
  }
  #endif // __F16C__
  #if defined(__AVX512F__)
  // Sixteen floats -> __m512 using an unaligned load.
  template <>
  inline __m512 load(const float *p) {
      const __m512 v = _mm512_loadu_ps(p);
      return v;
  }
  // Sixteen halves (one unaligned 256-bit load), widened to sixteen floats.
  template <>
  inline __m512 load(const ggml_fp16_t *p) {
      const __m256i halves = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(p));
      return _mm512_cvtph_ps(halves);
  }
  #endif // __AVX512F__
  207. ////////////////////////////////////////////////////////////////////////////////////////////////////
  208. // CONSTANTS
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // Sixteen-entry signed byte table plus a 128-bit register copy of it.
  // The block_iq4_nl loaders index the register copy with 4-bit values via
  // _mm_shuffle_epi8.
  static const int8_t kvalues_iq4nl[16] = {
      -127, -104, -83, -65, -49, -35, -22, -10,
         1,   13,  25,  38,  53,  69,  89, 113,
  };
  static const __m128i iq4nlt =
      _mm_loadu_si128(reinterpret_cast<const __m128i *>(kvalues_iq4nl));
  #endif
  213. ////////////////////////////////////////////////////////////////////////////////////////////////////
  214. // FLOATING POINT MATRIX MULTIPLICATION
  215. template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
  216. class tinyBLAS {
  217. public:
  218. tinyBLAS(int64_t k,
  219. const TA *A, int64_t lda,
  220. const TB *B, int64_t ldb,
  221. TC *C, int64_t ldc,
  222. int ith, int nth)
  223. : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
  224. }
  225. void matmul(int64_t m, int64_t n) {
  226. mnpack(0, m, 0, n);
  227. }
  228. private:
  229. NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  230. int64_t mc, nc, mp, np;
  231. switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
  232. #if VECTOR_REGISTERS == 32
  233. case 0x55:
  234. mc = 5;
  235. nc = 5;
  236. gemm<5, 5>(m0, m, n0, n);
  237. break;
  238. case 0x45:
  239. mc = 4;
  240. nc = 5;
  241. gemm<4, 5>(m0, m, n0, n);
  242. break;
  243. case 0x54:
  244. mc = 5;
  245. nc = 4;
  246. gemm<5, 4>(m0, m, n0, n);
  247. break;
  248. case 0x44:
  249. mc = 4;
  250. nc = 4;
  251. gemm<4, 4>(m0, m, n0, n);
  252. break;
  253. case 0x53:
  254. mc = 5;
  255. nc = 3;
  256. gemm<5, 3>(m0, m, n0, n);
  257. break;
  258. case 0x35:
  259. mc = 3;
  260. nc = 5;
  261. gemm<3, 5>(m0, m, n0, n);
  262. break;
  263. case 0x43:
  264. mc = 4;
  265. nc = 3;
  266. gemm<4, 3>(m0, m, n0, n);
  267. break;
  268. #else
  269. case 0x55:
  270. case 0x54:
  271. case 0x53:
  272. case 0x45:
  273. case 0x44:
  274. case 0x43:
  275. mc = 4;
  276. nc = 3;
  277. gemm<4, 3>(m0, m, n0, n);
  278. break;
  279. case 0x35:
  280. #endif
  281. case 0x34:
  282. mc = 3;
  283. nc = 4;
  284. gemm<3, 4>(m0, m, n0, n);
  285. break;
  286. case 0x52:
  287. mc = 5;
  288. nc = 2;
  289. gemm<5, 2>(m0, m, n0, n);
  290. break;
  291. case 0x33:
  292. mc = 3;
  293. nc = 3;
  294. gemm<3, 3>(m0, m, n0, n);
  295. break;
  296. case 0x25:
  297. mc = 2;
  298. nc = 5;
  299. gemm<2, 5>(m0, m, n0, n);
  300. break;
  301. case 0x42:
  302. mc = 4;
  303. nc = 2;
  304. gemm<4, 2>(m0, m, n0, n);
  305. break;
  306. case 0x24:
  307. mc = 2;
  308. nc = 4;
  309. gemm<2, 4>(m0, m, n0, n);
  310. break;
  311. case 0x32:
  312. mc = 3;
  313. nc = 2;
  314. gemm<3, 2>(m0, m, n0, n);
  315. break;
  316. case 0x23:
  317. mc = 2;
  318. nc = 3;
  319. gemm<2, 3>(m0, m, n0, n);
  320. break;
  321. case 0x51:
  322. mc = 5;
  323. nc = 1;
  324. gemm<5, 1>(m0, m, n0, n);
  325. break;
  326. case 0x41:
  327. mc = 4;
  328. nc = 1;
  329. gemm<4, 1>(m0, m, n0, n);
  330. break;
  331. case 0x22:
  332. mc = 2;
  333. nc = 2;
  334. gemm<2, 2>(m0, m, n0, n);
  335. break;
  336. case 0x15:
  337. mc = 1;
  338. nc = 5;
  339. gemm<1, 5>(m0, m, n0, n);
  340. break;
  341. case 0x14:
  342. mc = 1;
  343. nc = 4;
  344. gemm<1, 4>(m0, m, n0, n);
  345. break;
  346. case 0x31:
  347. mc = 3;
  348. nc = 1;
  349. gemm<3, 1>(m0, m, n0, n);
  350. break;
  351. case 0x13:
  352. mc = 1;
  353. nc = 3;
  354. gemm<1, 3>(m0, m, n0, n);
  355. break;
  356. case 0x21:
  357. mc = 2;
  358. nc = 1;
  359. gemm<2, 1>(m0, m, n0, n);
  360. break;
  361. case 0x12:
  362. mc = 1;
  363. nc = 2;
  364. gemm<1, 2>(m0, m, n0, n);
  365. break;
  366. case 0x11:
  367. mc = 1;
  368. nc = 1;
  369. gemm<1, 1>(m0, m, n0, n);
  370. break;
  371. default:
  372. return;
  373. }
  374. mp = m0 + (m - m0) / mc * mc;
  375. np = n0 + (n - n0) / nc * nc;
  376. mnpack(mp, m, n0, np);
  377. mnpack(m0, m, np, n);
  378. }
  379. template <int RM, int RN>
  380. NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  381. int64_t ytiles = (m - m0) / RM;
  382. int64_t xtiles = (n - n0) / RN;
  383. int64_t tiles = xtiles * ytiles;
  384. int64_t duty = (tiles + nth - 1) / nth;
  385. int64_t start = duty * ith;
  386. int64_t end = start + duty;
  387. if (end > tiles)
  388. end = tiles;
  389. for (int64_t job = start; job < end; ++job) {
  390. int64_t ii = m0 + job / xtiles * RM;
  391. int64_t jj = n0 + job % xtiles * RN;
  392. D Cv[RN][RM] = {};
  393. for (int64_t l = 0; l < k; l += KN)
  394. for (int64_t j = 0; j < RN; ++j)
  395. for (int64_t i = 0; i < RM; ++i)
  396. Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
  397. load<V>(B + ldb * (jj + j) + l),
  398. Cv[j][i]);
  399. for (int64_t j = 0; j < RN; ++j)
  400. for (int64_t i = 0; i < RM; ++i)
  401. C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
  402. }
  403. }
  404. const TA *const A;
  405. const TB *const B;
  406. TC *const C;
  407. const int64_t k;
  408. const int64_t lda;
  409. const int64_t ldb;
  410. const int64_t ldc;
  411. const int ith;
  412. const int nth;
  413. };
  414. //////////////////////////////////////////////////////////////////////////////////////////
  415. // QUANT ZERO MATRIX MULTIPLICATION
  #if defined(__ARM_FEATURE_DOTPROD)
  /**
   * Quantized-block matrix multiplication for ARM with the dot-product
   * extension.
   *
   * A holds blocks of type TA (load_lo/load_hi overloads exist for block_q8_0
   * and block_q4_0) and B holds block_q8_0 blocks. For each pair of blocks the
   * 8-bit values are multiplied and summed with vdotq_s32, converted to float,
   * scaled by the product of the two block scales (`d`), and accumulated.
   * Thread `ith` of `nth` handles one contiguous slice of the tile list.
   */
  template <typename TA>
  class tinyBLAS_Q0_ARM {
  public:
      tinyBLAS_Q0_ARM(int64_t k,
                      const TA *A, int64_t lda,
                      const block_q8_0 *B, int64_t ldb,
                      float *C, int64_t ldc,
                      int ith, int nth)
          : A(A), B(B), C(C),
            k(k), lda(lda), ldb(ldb), ldc(ldc),
            ith(ith), nth(nth) {
      }

      // Computes the m x n output starting at the origin.
      void matmul(int64_t m, int64_t n) {
          mnpack(0, m, 0, n);
      }

  private:
      // Picks a tile shape of at most 3 x 3 for [m0, m) x [n0, n), runs it
      // over every whole tile, then recurses on the two leftover strips.
      NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
          int64_t tile_m, tile_n;
          switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) {
          case 0x33: tile_m = 3; tile_n = 3; gemm<3, 3>(m0, m, n0, n); break;
          case 0x32: tile_m = 3; tile_n = 2; gemm<3, 2>(m0, m, n0, n); break;
          case 0x23: tile_m = 2; tile_n = 3; gemm<2, 3>(m0, m, n0, n); break;
          case 0x22: tile_m = 2; tile_n = 2; gemm<2, 2>(m0, m, n0, n); break;
          case 0x31: tile_m = 3; tile_n = 1; gemm<3, 1>(m0, m, n0, n); break;
          case 0x13: tile_m = 1; tile_n = 3; gemm<1, 3>(m0, m, n0, n); break;
          case 0x21: tile_m = 2; tile_n = 1; gemm<2, 1>(m0, m, n0, n); break;
          case 0x12: tile_m = 1; tile_n = 2; gemm<1, 2>(m0, m, n0, n); break;
          case 0x11: tile_m = 1; tile_n = 1; gemm<1, 1>(m0, m, n0, n); break;
          default:
              // An empty region in either dimension ends the recursion.
              return;
          }
          const int64_t m_done = m0 + (m - m0) / tile_m * tile_m;
          const int64_t n_done = n0 + (n - n0) / tile_n * tile_n;
          mnpack(m_done, m, n0, n_done);  // rows left under the tiled area
          mnpack(m0, m, n_done, n);       // columns left beside it
      }

      // Runs the RM x RN tile kernel over every whole tile of
      // [m0, m) x [n0, n) assigned to this thread.
      template <int RM, int RN>
      NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
          const int64_t ytiles = (m - m0) / RM;
          const int64_t xtiles = (n - n0) / RN;
          const int64_t tiles = xtiles * ytiles;
          // Ceil-divide the tile list among nth threads; clamp the last slice.
          const int64_t duty = (tiles + nth - 1) / nth;
          const int64_t first = duty * ith;
          const int64_t stop = (first + duty > tiles) ? tiles : first + duty;
          for (int64_t job = first; job < stop; ++job) {
              const int64_t ii = m0 + (job / xtiles) * RM;
              const int64_t jj = n0 + (job % xtiles) * RN;
              float32x4_t Cv[RN][RM] = {};
              for (int64_t l = 0; l < k; ++l) {
                  for (int64_t j = 0; j < RN; ++j) {
                      for (int64_t i = 0; i < RM; ++i) {
                          const TA *a_blk = A + lda * (ii + i) + l;
                          const block_q8_0 *b_blk = B + ldb * (jj + j) + l;
                          // Integer dot product over both halves of the block.
                          int32x4_t dot = vdotq_s32(vdupq_n_s32(0),
                                                    load_lo(a_blk), load_lo(b_blk));
                          dot = vdotq_s32(dot, load_hi(a_blk), load_hi(b_blk));
                          // Combined scale of the two blocks.
                          const float scale = unhalf(a_blk->d) * unhalf(b_blk->d);
                          Cv[j][i] = vmlaq_n_f32(Cv[j][i], vcvtq_f32_s32(dot), scale);
                      }
                  }
              }
              // Reduce each vector accumulator to a scalar and store it.
              for (int64_t j = 0; j < RN; ++j) {
                  for (int64_t i = 0; i < RM; ++i) {
                      C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
                  }
              }
          }
      }

      // First 16 signed bytes of a q8_0 block.
      inline int8x16_t load_lo(const block_q8_0 *b) {
          return vld1q_s8(b->qs);
      }

      // Second 16 signed bytes of a q8_0 block.
      inline int8x16_t load_hi(const block_q8_0 *b) {
          return vld1q_s8(b->qs + 16);
      }

      // Low nibbles of a q4_0 block, shifted down by 8 into signed bytes.
      inline int8x16_t load_lo(const block_q4_0 *b) {
          const uint8x16_t packed = vld1q_u8(b->qs);
          const uint8x16_t nibbles = vandq_u8(packed, vdupq_n_u8(0x0f));
          return vsubq_s8(vreinterpretq_s8_u8(nibbles), vdupq_n_s8(0x8));
      }

      // High nibbles of a q4_0 block, shifted down by 8 into signed bytes.
      inline int8x16_t load_hi(const block_q4_0 *b) {
          const uint8x16_t packed = vld1q_u8(b->qs);
          const uint8x16_t nibbles = vshrq_n_u8(packed, 4);
          return vsubq_s8(vreinterpretq_s8_u8(nibbles), vdupq_n_s8(0x8));
      }

      const TA *const A;
      const block_q8_0 *const B;
      float *const C;
      const int64_t k;
      const int64_t lda;
      const int64_t ldb;
      const int64_t ldc;
      const int ith;
      const int nth;
  };
  #endif // __ARM_FEATURE_DOTPROD
  544. #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  545. template <typename TA, typename TB, typename TC>
  546. class tinyBLAS_Q0_AVX {
  547. public:
  548. tinyBLAS_Q0_AVX(int64_t k,
  549. const TA *A, int64_t lda,
  550. const TB *B, int64_t ldb,
  551. TC *C, int64_t ldc,
  552. int ith, int nth)
  553. : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
  554. }
  555. void matmul(int64_t m, int64_t n) {
  556. mnpack(0, m, 0, n);
  557. }
  558. private:
  559. void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  560. int64_t mc, nc, mp, np;
  561. switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
  562. #if VECTOR_REGISTERS == 32
  563. case 0x44:
  564. mc = 4;
  565. nc = 4;
  566. #if defined(__AVX2__) && defined(__F16C__)
  567. gemm4xN<4>(m0, m, n0, n);
  568. #else
  569. gemm<4, 4>(m0, m, n0, n);
  570. #endif
  571. break;
  572. case 0x43:
  573. mc = 4;
  574. nc = 3;
  575. #if defined(__AVX2__) && defined(__F16C__)
  576. gemm4xN<3>(m0, m, n0, n);
  577. #else
  578. gemm<4, 3>(m0, m, n0, n);
  579. #endif
  580. break;
  581. case 0x34:
  582. mc = 3;
  583. nc = 4;
  584. #if defined(__AVX2__) && defined(__F16C__)
  585. gemmMx4<3>(m0, m, n0, n);
  586. #else
  587. gemm<3, 4>(m0, m, n0, n);
  588. #endif
  589. break;
  590. case 0x33:
  591. mc = 3;
  592. nc = 3;
  593. gemm<3, 3>(m0, m, n0, n);
  594. break;
  595. case 0x42:
  596. mc = 4;
  597. nc = 2;
  598. #if defined(__AVX2__) && defined(__F16C__)
  599. gemm4xN<2>(m0, m, n0, n);
  600. #else
  601. gemm<4, 2>(m0, m, n0, n);
  602. #endif
  603. break;
  604. case 0x24:
  605. mc = 2;
  606. nc = 4;
  607. #if defined(__AVX2__) && defined(__F16C__)
  608. gemmMx4<2>(m0, m, n0, n);
  609. #else
  610. gemm<2, 4>(m0, m, n0, n);
  611. #endif
  612. break;
  613. #else
  614. case 0x44:
  615. case 0x43:
  616. case 0x42:
  617. mc = 4;
  618. nc = 2;
  619. #if defined(__AVX2__) && defined(__F16C__)
  620. gemm4xN<2>(m0, m, n0, n);
  621. #else
  622. gemm<4, 2>(m0, m, n0, n);
  623. #endif
  624. break;
  625. case 0x34:
  626. case 0x24:
  627. mc = 2;
  628. nc = 4;
  629. #if defined(__AVX2__) && defined(__F16C__)
  630. gemmMx4<2>(m0, m, n0, n);
  631. #else
  632. gemm<2, 4>(m0, m, n0, n);
  633. #endif
  634. break;
  635. case 0x33:
  636. #endif
  637. case 0x32:
  638. mc = 3;
  639. nc = 2;
  640. gemm<3, 2>(m0, m, n0, n);
  641. break;
  642. case 0x23:
  643. mc = 2;
  644. nc = 3;
  645. gemm<2, 3>(m0, m, n0, n);
  646. break;
  647. case 0x41:
  648. mc = 4;
  649. nc = 1;
  650. #if defined(__AVX2__) && defined(__F16C__)
  651. gemm4xN<1>(m0, m, n0, n);
  652. #else
  653. gemm<4, 1>(m0, m, n0, n);
  654. #endif
  655. break;
  656. case 0x22:
  657. mc = 2;
  658. nc = 2;
  659. gemm<2, 2>(m0, m, n0, n);
  660. break;
  661. case 0x14:
  662. mc = 1;
  663. nc = 4;
  664. #if defined(__AVX2__) && defined(__F16C__)
  665. gemmMx4<1>(m0, m, n0, n);
  666. #else
  667. gemm<1, 4>(m0, m, n0, n);
  668. #endif
  669. break;
  670. case 0x31:
  671. mc = 3;
  672. nc = 1;
  673. gemm<3, 1>(m0, m, n0, n);
  674. break;
  675. case 0x13:
  676. mc = 1;
  677. nc = 3;
  678. gemm<1, 3>(m0, m, n0, n);
  679. break;
  680. case 0x21:
  681. mc = 2;
  682. nc = 1;
  683. gemm<2, 1>(m0, m, n0, n);
  684. break;
  685. case 0x12:
  686. mc = 1;
  687. nc = 2;
  688. gemm<1, 2>(m0, m, n0, n);
  689. break;
  690. case 0x11:
  691. mc = 1;
  692. nc = 1;
  693. gemm<1, 1>(m0, m, n0, n);
  694. break;
  695. default:
  696. return;
  697. }
  698. mp = m0 + (m - m0) / mc * mc;
  699. np = n0 + (n - n0) / nc * nc;
  700. mnpack(mp, m, n0, np);
  701. mnpack(m0, m, np, n);
  702. }
  703. #if defined(__AVX2__) && defined(__F16C__)
  704. // Templated functions for gemm of dimensions 4xN
  705. template <int RN>
  706. NOINLINE void gemm4xN(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  707. int64_t ytiles = (m - m0) / 4;
  708. int64_t xtiles = (n - n0) / RN;
  709. int64_t tiles = xtiles * ytiles;
  710. int64_t duty = (tiles + nth - 1) / nth;
  711. int64_t start = duty * ith;
  712. int64_t end = start + duty;
  713. if (end > tiles)
  714. end = tiles;
  715. for (int64_t job = start; job < end; ++job) {
  716. int64_t ii = m0 + job / xtiles * 4;
  717. int64_t jj = n0 + job % xtiles * RN;
  718. __m256 Cv[RN][4] = {};
  719. for (int64_t l = 0; l < k; ++l) {
  720. uint64_t a_delta = ((uint64_t)A[lda * (ii + 3) + l].d << 48) | ((uint64_t)A[lda * (ii + 2) + l].d << 32) | ((uint64_t)A[lda * (ii + 1) + l].d << 16) | (A[lda * (ii + 0) + l].d);
  721. // Convert delta values for four blocks to float values
  722. __m128 da = _mm_cvtph_ps(_mm_set_epi64x(0, a_delta));
  723. __m256i avec0 = load(A + lda * (ii + 0) + l);
  724. __m256i avec1 = load(A + lda * (ii + 1) + l);
  725. __m256i avec2 = load(A + lda * (ii + 2) + l);
  726. __m256i avec3 = load(A + lda * (ii + 3) + l);
  727. for (int64_t j = 0; j < RN; ++j) {
  728. __m128 db = _mm_set1_ps(unhalf(B[ldb * (jj + j) + l].d));
  729. // Computation of product of delta values for four blocks and replicate it across 256 bit lane
  730. __m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
  731. dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
  732. // Computation of dot product and multiplication with appropriate delta value products
  733. Cv[j][0] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
  734. updot(_mm256_sign_epi8(avec0, avec0),
  735. _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec0)),
  736. Cv[j][0]);
  737. Cv[j][1] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
  738. updot(_mm256_sign_epi8(avec1, avec1),
  739. _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec1)),
  740. Cv[j][1]);
  741. Cv[j][2] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
  742. updot(_mm256_sign_epi8(avec2, avec2),
  743. _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec2)),
  744. Cv[j][2]);
  745. Cv[j][3] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
  746. updot(_mm256_sign_epi8(avec3, avec3),
  747. _mm256_sign_epi8(load(B + ldb * (jj + j) + l), avec3)),
  748. Cv[j][3]);
  749. }
  750. }
  751. for (int64_t j = 0; j < RN; ++j)
  752. for (int64_t i = 0; i < 4; ++i)
  753. C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
  754. }
  755. }
  756. // Templated functions for gemm of dimensions Mx4
  757. template <int RM>
  758. NOINLINE void gemmMx4(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  759. int64_t ytiles = (m - m0) / RM;
  760. int64_t xtiles = (n - n0) / 4;
  761. int64_t tiles = xtiles * ytiles;
  762. int64_t duty = (tiles + nth - 1) / nth;
  763. int64_t start = duty * ith;
  764. int64_t end = start + duty;
  765. if (end > tiles)
  766. end = tiles;
  767. for (int64_t job = start; job < end; ++job) {
  768. int64_t ii = m0 + job / xtiles * RM;
  769. int64_t jj = n0 + job % xtiles * 4;
  770. __m256 Cv[4][RM] = {};
  771. for (int64_t l = 0; l < k; ++l) {
  772. uint64_t b_delta = ((uint64_t)B[ldb * (jj + 3) + l].d << 48) | ((uint64_t)B[ldb * (jj + 2) + l].d << 32) | ((uint64_t)B[ldb * (jj + 1) + l].d << 16) | (B[ldb * (jj + 0) + l].d);
  773. // Convert delta values for four blocks to float values
  774. __m128 db = _mm_cvtph_ps(_mm_set_epi64x(0, b_delta));
  775. __m256i bvec0 = load(B + ldb * (jj + 0) + l);
  776. __m256i bvec1 = load(B + ldb * (jj + 1) + l);
  777. __m256i bvec2 = load(B + ldb * (jj + 2) + l);
  778. __m256i bvec3 = load(B + ldb * (jj + 3) + l);
  779. for (int64_t i = 0; i < RM; ++i) {
  780. __m128 da = _mm_set1_ps(unhalf((A[lda * (ii + i) + l].d)));
  781. // Computation of product of delta values for four blocks and replicate it across 256 bit lane
  782. __m256 dvec = _mm256_castps128_ps256(_mm_mul_ps(da, db));
  783. dvec = _mm256_permute2f128_ps(dvec ,dvec, 0);
  784. // Computation of dot product and multiplication with appropriate delta value products
  785. Cv[0][i] = madd(_mm256_shuffle_ps(dvec, dvec, 0),
  786. updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
  787. load(A + lda * (ii + i) + l)),
  788. _mm256_sign_epi8(bvec0, load(A + lda * (ii + i) + l))),
  789. Cv[0][i]);
  790. Cv[1][i] = madd(_mm256_shuffle_ps(dvec, dvec, 85),
  791. updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
  792. load(A + lda * (ii + i) + l)),
  793. _mm256_sign_epi8(bvec1, load(A + lda * (ii + i) + l))),
  794. Cv[1][i]);
  795. Cv[2][i] = madd(_mm256_shuffle_ps(dvec, dvec, 170),
  796. updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
  797. load(A + lda * (ii + i) + l)),
  798. _mm256_sign_epi8(bvec2, load(A + lda * (ii + i) + l))),
  799. Cv[2][i]);
  800. Cv[3][i] = madd(_mm256_shuffle_ps(dvec, dvec, 255),
  801. updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
  802. load(A + lda * (ii + i) + l)),
  803. _mm256_sign_epi8(bvec3, load(A + lda * (ii + i) + l))),
  804. Cv[3][i]);
  805. }
  806. }
  807. for (int64_t j = 0; j < 4; ++j)
  808. for (int64_t i = 0; i < RM; ++i)
  809. C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
  810. }
  811. }
  812. #endif
  813. template <int RM, int RN>
  814. NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  815. int64_t ytiles = (m - m0) / RM;
  816. int64_t xtiles = (n - n0) / RN;
  817. int64_t tiles = xtiles * ytiles;
  818. int64_t duty = (tiles + nth - 1) / nth;
  819. int64_t start = duty * ith;
  820. int64_t end = start + duty;
  821. if (end > tiles)
  822. end = tiles;
  823. for (int64_t job = start; job < end; ++job) {
  824. int64_t ii = m0 + job / xtiles * RM;
  825. int64_t jj = n0 + job % xtiles * RN;
  826. __m256 Cv[RN][RM] = {};
  827. for (int64_t l = 0; l < k; ++l)
  828. for (int64_t j = 0; j < RN; ++j)
  829. for (int64_t i = 0; i < RM; ++i) {
  830. #if defined(__AVX2__)
  831. __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
  832. load(A + lda * (ii + i) + l)),
  833. _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
  834. load(A + lda * (ii + i) + l)));
  835. #else
  836. __m128i ali0 = load0(A + lda * (ii + i) + l);
  837. __m128i ali1 = load1(A + lda * (ii + i) + l);
  838. __m128i blj0 = load0(B + ldb * (jj + j) + l);
  839. __m128i blj1 = load1(B + ldb * (jj + j) + l);
  840. __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
  841. __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
  842. __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
  843. __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
  844. // updot
  845. const __m128i oneFill = _mm_set1_epi16(1);
  846. __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
  847. __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
  848. __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
  849. #endif
  850. Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
  851. unhalf(B[ldb * (jj + j) + l].d)),
  852. udTmp,
  853. Cv[j][i]);
  854. }
  855. for (int64_t j = 0; j < RN; ++j)
  856. for (int64_t i = 0; i < RM; ++i)
  857. C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
  858. }
  859. }
  860. inline __m256i load(const block_q8_0 *b) {
  861. return _mm256_loadu_si256((const __m256i *)b->qs);
  862. }
  863. inline __m128i load0(const block_q8_0 *b) {
  864. return _mm_loadu_si128((const __m128i *)b->qs);
  865. }
  866. inline __m128i load1(const block_q8_0 *b) {
  867. return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
  868. }
  869. inline __m256i load(const block_q4_0 *b) {
  870. return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
  871. }
  872. inline __m128i load0(const block_q4_0 *b) {
  873. const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
  874. return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
  875. }
  876. inline __m128i load1(const block_q4_0 *b) {
  877. const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
  878. return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
  879. }
  880. inline __m256i load(const block_iq4_nl *b) {
  881. return MM256_SET_M128I(load1(b), load0(b));
  882. }
  883. inline __m128i load0(const block_iq4_nl *b) {
  884. const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
  885. return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x));
  886. }
  887. inline __m128i load1(const block_iq4_nl *b) {
  888. const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
  889. return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)));
  890. }
  891. inline __m256 updot(__m256i u, __m256i s) {
  892. __m256i res;
  893. #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
  894. res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
  895. #else
  896. res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
  897. #endif
  898. return _mm256_cvtepi32_ps(res);
  899. }
  900. static inline __m256i denibble(const uint8_t *p) {
  901. __m128i x = _mm_loadu_si128((const __m128i *)p);
  902. return _mm256_and_si256(_mm256_set1_epi8(15),
  903. _mm256_insertf128_si256(_mm256_castsi128_si256(x),
  904. _mm_srli_epi16(x, 4), 1));
  905. }
  // Operand/result pointers and problem geometry. Every member is const,
  // so these are fixed once the object has been constructed.

  // Left operand; gemm() reads block `A[lda * row + l]`.
  906. const TA *const A;
  // Right operand; gemm() reads block `B[ldb * col + l]`.
  907. const TB *const B;
  // Output; gemm() writes element `C[ldc * col + row]`.
  908. TC *const C;
  // Number of blocks along the shared dimension (upper bound of the `l` loop).
  909. const int64_t k;
  // Stride, in TA elements, between consecutive rows of A.
  910. const int64_t lda;
  // Stride, in TB elements, between consecutive columns of B.
  911. const int64_t ldb;
  // Stride, in TC elements, between consecutive columns of C.
  912. const int64_t ldc;
  // Index of the calling thread; selects which run of tiles gemm() processes.
  913. const int ith;
  // Total number of threads the tiles are divided among.
  914. const int nth;
  915. };
  916. #endif // __AVX__
  917. } // namespace
  918. /**
  919. * Performs optimized matrix multiplication on CPU.
  920. *
  921. * This subroutine may compute C = Aᵀ * B with column major ordering.
  922. * Despite its name, this isn't a generalized implementation. Work is
  923. * only performed when a handwritten kernel is written and available.
  924. * Otherwise the caller should fall back to a general matmul routine.
  925. *
  926. * For example, for single-threaded single-precision GEMM you can say
  927. *
  928. * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
  929. * 0, 1,
  930. * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
  931. *
  932. * @param m is rows in `A` and `C`
  933. * @param n is cols in `B` and `C`
  934. * @param k is cols in `A` and rows in `B`
  935. * @param A is first input matrix (always transposed)
  936. * @param lda is row stride of `A`
  937. * @param B is second input matrix (never transposed)
  938. * @param ldb is row stride of `B`
  939. * @param C is input/output array of output matrices
  940. * @param ldc is row stride of `C`
  941. * @param ith is thread id (must be less than `nth`)
  942. * @param nth is number of threads (must be greater than zero)
  943. * @param Atype is GGML data type of `A`
  944. * @param Btype is GGML data type of `B`
  945. * @param Ctype is GGML data type of `C`
  946. * @return true if this function was able to service the matmul request
  947. */
  948. bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
  949. int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) {
  950. assert(m >= 0);
  951. assert(n >= 0);
  952. assert(k >= 0);
  953. assert(lda >= k);
  954. assert(ldb >= k);
  955. assert(ldc >= m);
  956. assert(nth > 0);
  957. assert(ith < nth);
  958. // only enable sgemm for prompt processing
  959. if (n < 2)
  960. return false;
  961. if (Ctype != GGML_TYPE_F32)
  962. return false;
  963. switch (Atype) {
  964. case GGML_TYPE_F32: {
  965. if (Btype != GGML_TYPE_F32)
  966. return false;
  967. #if defined(__AVX512F__)
  968. if (k % 16)
  969. return false;
  970. tinyBLAS<16, __m512, __m512, float, float, float> tb{
  971. k, (const float *)A, lda,
  972. (const float *)B, ldb,
  973. (float *)C, ldc,
  974. ith, nth};
  975. tb.matmul(m, n);
  976. return true;
  977. #elif defined(__AVX__) || defined(__AVX2__)
  978. if (k % 8)
  979. return false;
  980. tinyBLAS<8, __m256, __m256, float, float, float> tb{
  981. k, (const float *)A, lda,
  982. (const float *)B, ldb,
  983. (float *)C, ldc,
  984. ith, nth};
  985. tb.matmul(m, n);
  986. return true;
  987. #elif defined(__ARM_NEON)
  988. if (n < 4)
  989. return false;
  990. if (k % 4)
  991. return false;
  992. tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
  993. k, (const float *)A, lda,
  994. (const float *)B, ldb,
  995. (float *)C, ldc,
  996. ith, nth};
  997. tb.matmul(m, n);
  998. return true;
  999. #else
  1000. return false;
  1001. #endif
  1002. }
  1003. case GGML_TYPE_F16: {
  1004. #if defined(__AVX512F__)
  1005. if (k % 16)
  1006. return false;
  1007. if (Btype != GGML_TYPE_F32)
  1008. return false;
  1009. tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
  1010. k, (const ggml_fp16_t *)A, lda,
  1011. (const float *)B, ldb,
  1012. (float *)C, ldc,
  1013. ith, nth};
  1014. tb.matmul(m, n);
  1015. return true;
  1016. #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
  1017. if (k % 8)
  1018. return false;
  1019. if (Btype != GGML_TYPE_F32)
  1020. return false;
  1021. tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
  1022. k, (const ggml_fp16_t *)A, lda,
  1023. (const float *)B, ldb,
  1024. (float *)C, ldc,
  1025. ith, nth};
  1026. tb.matmul(m, n);
  1027. return true;
  1028. #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
  1029. if (n < 8)
  1030. return false;
  1031. if (k % 8)
  1032. return false;
  1033. if (Btype != GGML_TYPE_F16)
  1034. return false;
  1035. tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
  1036. k, (const ggml_fp16_t *)A, lda,
  1037. (const ggml_fp16_t *)B, ldb,
  1038. (float *)C, ldc,
  1039. ith, nth};
  1040. tb.matmul(m, n);
  1041. return true;
  1042. #elif defined(__ARM_NEON) && !defined(_MSC_VER)
  1043. if (k % 4)
  1044. return false;
  1045. if (Btype != GGML_TYPE_F32)
  1046. return false;
  1047. tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
  1048. k, (const ggml_fp16_t *)A, lda,
  1049. (const float *)B, ldb,
  1050. (float *)C, ldc,
  1051. ith, nth};
  1052. tb.matmul(m, n);
  1053. return true;
  1054. #else
  1055. return false;
  1056. #endif
  1057. }
  1058. case GGML_TYPE_Q8_0: {
  1059. if (Btype != GGML_TYPE_Q8_0)
  1060. return false;
  1061. #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  1062. tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
  1063. k, (const block_q8_0 *)A, lda,
  1064. (const block_q8_0 *)B, ldb,
  1065. (float *)C, ldc,
  1066. ith, nth};
  1067. tb.matmul(m, n);
  1068. return true;
  1069. #elif defined(__ARM_FEATURE_DOTPROD)
  1070. tinyBLAS_Q0_ARM<block_q8_0> tb{
  1071. k, (const block_q8_0 *)A, lda,
  1072. (const block_q8_0 *)B, ldb,
  1073. (float *)C, ldc,
  1074. ith, nth};
  1075. tb.matmul(m, n);
  1076. return true;
  1077. #else
  1078. return false;
  1079. #endif
  1080. }
  1081. case GGML_TYPE_Q4_0: {
  1082. if (Btype != GGML_TYPE_Q8_0)
  1083. return false;
  1084. #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  1085. tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
  1086. k, (const block_q4_0 *)A, lda,
  1087. (const block_q8_0 *)B, ldb,
  1088. (float *)C, ldc,
  1089. ith, nth};
  1090. tb.matmul(m, n);
  1091. return true;
  1092. #elif defined(__ARM_FEATURE_DOTPROD)
  1093. tinyBLAS_Q0_ARM<block_q4_0> tb{
  1094. k, (const block_q4_0 *)A, lda,
  1095. (const block_q8_0 *)B, ldb,
  1096. (float *)C, ldc,
  1097. ith, nth};
  1098. tb.matmul(m, n);
  1099. return true;
  1100. #else
  1101. return false;
  1102. #endif
  1103. }
  1104. case GGML_TYPE_IQ4_NL: {
  1105. if (Btype != GGML_TYPE_Q8_0)
  1106. return false;
  1107. #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  1108. tinyBLAS_Q0_AVX<block_iq4_nl, block_q8_0, float> tb{
  1109. k, (const block_iq4_nl *)A, lda,
  1110. (const block_q8_0 *)B, ldb,
  1111. (float *)C, ldc,
  1112. ith, nth};
  1113. tb.matmul(m, n);
  1114. return true;
  1115. #else
  1116. return false;
  1117. #endif
  1118. }
  1119. default:
  1120. return false;
  1121. }
  1122. (void)m;
  1123. (void)n;
  1124. (void)k;
  1125. (void)A;
  1126. (void)lda;
  1127. (void)B;
  1128. (void)ldb;
  1129. (void)C;
  1130. (void)ldc;
  1131. (void)ith;
  1132. (void)nth;
  1133. (void)Atype;
  1134. (void)Btype;
  1135. (void)Ctype;
  1136. }