sgemm.cpp 31 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030
  1. // Copyright 2024 Mozilla Foundation
  2. //
  3. // Permission is hereby granted, free of charge, to any person obtaining
  4. // a copy of this software and associated documentation files (the
  5. // "Software"), to deal in the Software without restriction, including
  6. // without limitation the rights to use, copy, modify, merge, publish,
  7. // distribute, sublicense, and/or sell copies of the Software, and to
  8. // permit persons to whom the Software is furnished to do so, subject to
  9. // the following conditions:
  10. //
  11. // The above copyright notice and this permission notice shall be
  12. // included in all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  15. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  16. // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  17. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  18. // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  19. // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  20. // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. // SOFTWARE.
  22. //
  23. // _ _ ___ _ _ ___
  24. // | |_(_)_ _ _ _| _ ) | /_\ / __|
  25. // | _| | ' \ || | _ \ |__ / _ \\__ \.
  26. // \__|_|_||_\_, |___/____/_/ \_\___/
  27. // |__/
  28. //
  29. // BASIC LINEAR ALGEBRA SUBPROGRAMS
  30. //
  31. //
  32. // This file implements multithreaded CPU matrix multiplication for the
  33. // common contiguous use case C = Aᵀ * B. These kernels are designed to
  34. // have excellent performance[1] for matrices that fit in the CPU cache
  35. // without imposing any overhead such as cache filling or malloc calls.
  36. //
  37. // This implementation does not guarantee any upper bound with rounding
  38. // errors, which grow along with k. Our goal's to maximally exploit the
  39. // hardware for performance, and then use whatever resources remain for
  40. // improving numerical accuracy.
  41. //
  42. // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
  43. // Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
  44. #pragma GCC diagnostic ignored "-Wpedantic"
  45. #pragma GCC diagnostic ignored "-Wignored-attributes"
  46. #include "sgemm.h"
  47. #include "ggml-impl.h"
  48. #include "ggml-quants.h"
  49. #ifdef _MSC_VER
  50. #define NOINLINE __declspec(noinline)
  51. #else
  52. #define NOINLINE __attribute__((__noinline__))
  53. #endif
  54. #if defined(__ARM_NEON) || defined(__AVX512F__)
  55. #define VECTOR_REGISTERS 32
  56. #else
  57. #define VECTOR_REGISTERS 16
  58. #endif
  59. #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
  60. namespace {
  61. inline float unhalf(ggml_fp16_t d) {
  62. return GGML_FP16_TO_FP32(d);
  63. }
  64. ////////////////////////////////////////////////////////////////////////////////////////////////////
  65. // VECTORIZED ARITHMETIC OPERATIONS
  66. #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  67. inline __m128 add(__m128 x, __m128 y) { return _mm_add_ps(x, y); }
  68. inline __m128 sub(__m128 x, __m128 y) { return _mm_sub_ps(x, y); }
  69. inline __m128 mul(__m128 x, __m128 y) { return _mm_mul_ps(x, y); }
  70. #endif // __SSE__
  71. #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  72. inline __m256 add(__m256 x, __m256 y) { return _mm256_add_ps(x, y); }
  73. inline __m256 sub(__m256 x, __m256 y) { return _mm256_sub_ps(x, y); }
  74. inline __m256 mul(__m256 x, __m256 y) { return _mm256_mul_ps(x, y); }
  75. #endif // __AVX__
  76. #if defined(__AVX512F__)
  77. inline __m512 add(__m512 x, __m512 y) { return _mm512_add_ps(x, y); }
  78. inline __m512 sub(__m512 x, __m512 y) { return _mm512_sub_ps(x, y); }
  79. inline __m512 mul(__m512 x, __m512 y) { return _mm512_mul_ps(x, y); }
  80. #endif // __AVX512F__
  81. #if defined(__ARM_NEON)
  82. inline float32x4_t add(float32x4_t x, float32x4_t y) { return vaddq_f32(x, y); }
  83. inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vsubq_f32(x, y); }
  84. inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vmulq_f32(x, y); }
  85. #endif // __ARM_NEON
  86. #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
  87. inline float16x8_t add(float16x8_t x, float16x8_t y) { return vaddq_f16(x, y); }
  88. inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); }
  89. inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); }
  90. #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  91. ////////////////////////////////////////////////////////////////////////////////////////////////////
  92. // VECTORIZED FUSED MULTIPLY ADD
  93. /**
  94. * Computes a * b + c.
  95. */
  96. template <typename T, typename U>
  97. inline U madd(T a, T b, U c) {
  98. return add(mul(a, b), c);
  99. }
  100. #if defined(__FMA__)
  101. #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  102. template <>
  103. inline __m256 madd(__m256 a, __m256 b, __m256 c) {
  104. return _mm256_fmadd_ps(a, b, c);
  105. }
  106. #endif
  107. #if defined(__AVX512F__)
  108. template <>
  109. inline __m512 madd(__m512 a, __m512 b, __m512 c) {
  110. return _mm512_fmadd_ps(a, b, c);
  111. }
  112. #endif
  113. #endif
  114. #if defined(__ARM_FEATURE_FMA)
  115. template <>
  116. inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
  117. return vfmaq_f32(c, b, a);
  118. }
  119. #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
  120. template <>
  121. inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
  122. return vfmaq_f16(c, b, a);
  123. }
  124. #endif
  125. #endif
  126. ////////////////////////////////////////////////////////////////////////////////////////////////////
  127. // VECTORIZED HORIZONTAL SUM
  128. #if defined(__ARM_NEON)
  129. inline float hsum(float32x4_t x) {
  130. return vaddvq_f32(x);
  131. }
  132. #endif // __ARM_NEON
  133. #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
  134. inline float hsum(float16x8_t x) {
  135. return vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(x)),
  136. vcvt_f32_f16(vget_high_f16(x))));
  137. }
  138. #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  139. #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  140. inline float hsum(__m128 x) {
  141. #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  142. x = _mm_add_ps(x, _mm_movehl_ps(x, x));
  143. x = _mm_add_ss(x, _mm_movehdup_ps(x));
  144. #else
  145. __m128 t;
  146. t = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));
  147. x = _mm_add_ps(x, t);
  148. t = _mm_movehl_ps(t, x);
  149. x = _mm_add_ss(x, t);
  150. #endif
  151. return _mm_cvtss_f32(x);
  152. }
  153. #endif
  154. #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  155. inline float hsum(__m256 x) {
  156. return hsum(_mm_add_ps(_mm256_extractf128_ps(x, 1),
  157. _mm256_castps256_ps128(x)));
  158. }
  159. #endif // __AVX__
  160. #if defined(__AVX512F__)
  161. inline float hsum(__m512 x) {
  162. return _mm512_reduce_add_ps(x);
  163. }
  164. #endif // __AVX512F__
  165. ////////////////////////////////////////////////////////////////////////////////////////////////////
  166. // VECTORIZED MEMORY LOADING
  167. template <typename T, typename U> T load(const U *);
  168. #if defined(__ARM_NEON)
  169. template <> inline float32x4_t load(const float *p) {
  170. return vld1q_f32(p);
  171. }
  172. #if !defined(_MSC_VER)
  173. template <> inline float16x8_t load(const ggml_fp16_t *p) {
  174. return vld1q_f16((const float16_t *)p);
  175. }
  176. template <> inline float32x4_t load(const ggml_fp16_t *p) {
  177. return vcvt_f32_f16(vld1_f16((const float16_t *)p));
  178. }
  179. #endif // _MSC_VER
  180. #endif // __ARM_NEON
  181. #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  182. template <> inline __m128 load(const float *p) {
  183. return _mm_loadu_ps(p);
  184. }
  185. #endif // __SSE__
  186. #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  187. template <> inline __m256 load(const float *p) {
  188. return _mm256_loadu_ps(p);
  189. }
  190. #endif // __AVX__
  191. #if defined(__F16C__)
  192. template <> inline __m256 load(const ggml_fp16_t *p) {
  193. return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p));
  194. }
  195. #endif // __F16C__
  196. #if defined(__AVX512F__)
  197. template <> inline __m512 load(const float *p) {
  198. return _mm512_loadu_ps(p);
  199. }
  200. template <> inline __m512 load(const ggml_fp16_t *p) {
  201. return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p));
  202. }
  203. #endif // __AVX512F__
  204. ////////////////////////////////////////////////////////////////////////////////////////////////////
  205. // FLOATING POINT MATRIX MULTIPLICATION
  206. template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
  207. class tinyBLAS {
  208. public:
  209. tinyBLAS(int64_t k,
  210. const TA *A, int64_t lda,
  211. const TB *B, int64_t ldb,
  212. TC *C, int64_t ldc,
  213. int ith, int nth)
  214. : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
  215. }
  216. void matmul(int64_t m, int64_t n, int task) {
  217. if (task == GGML_TASK_TYPE_COMPUTE)
  218. mnpack(0, m, 0, n);
  219. }
  220. private:
  221. NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  222. int64_t mc, nc, mp, np;
  223. switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
  224. #if VECTOR_REGISTERS == 32
  225. case 0x55:
  226. mc = 5;
  227. nc = 5;
  228. gemm<5, 5>(m0, m, n0, n);
  229. break;
  230. case 0x45:
  231. mc = 4;
  232. nc = 5;
  233. gemm<4, 5>(m0, m, n0, n);
  234. break;
  235. case 0x54:
  236. mc = 5;
  237. nc = 4;
  238. gemm<5, 4>(m0, m, n0, n);
  239. break;
  240. case 0x44:
  241. mc = 4;
  242. nc = 4;
  243. gemm<4, 4>(m0, m, n0, n);
  244. break;
  245. case 0x53:
  246. mc = 5;
  247. nc = 3;
  248. gemm<5, 3>(m0, m, n0, n);
  249. break;
  250. case 0x35:
  251. mc = 3;
  252. nc = 5;
  253. gemm<3, 5>(m0, m, n0, n);
  254. break;
  255. case 0x43:
  256. mc = 4;
  257. nc = 3;
  258. gemm<4, 3>(m0, m, n0, n);
  259. break;
  260. #else
  261. case 0x55:
  262. case 0x54:
  263. case 0x53:
  264. case 0x45:
  265. case 0x44:
  266. case 0x43:
  267. mc = 4;
  268. nc = 3;
  269. gemm<4, 3>(m0, m, n0, n);
  270. break;
  271. case 0x35:
  272. #endif
  273. case 0x34:
  274. mc = 3;
  275. nc = 4;
  276. gemm<3, 4>(m0, m, n0, n);
  277. break;
  278. case 0x52:
  279. mc = 5;
  280. nc = 2;
  281. gemm<5, 2>(m0, m, n0, n);
  282. break;
  283. case 0x33:
  284. mc = 3;
  285. nc = 3;
  286. gemm<3, 3>(m0, m, n0, n);
  287. break;
  288. case 0x25:
  289. mc = 2;
  290. nc = 5;
  291. gemm<2, 5>(m0, m, n0, n);
  292. break;
  293. case 0x42:
  294. mc = 4;
  295. nc = 2;
  296. gemm<4, 2>(m0, m, n0, n);
  297. break;
  298. case 0x24:
  299. mc = 2;
  300. nc = 4;
  301. gemm<2, 4>(m0, m, n0, n);
  302. break;
  303. case 0x32:
  304. mc = 3;
  305. nc = 2;
  306. gemm<3, 2>(m0, m, n0, n);
  307. break;
  308. case 0x23:
  309. mc = 2;
  310. nc = 3;
  311. gemm<2, 3>(m0, m, n0, n);
  312. break;
  313. case 0x51:
  314. mc = 5;
  315. nc = 1;
  316. gemm<5, 1>(m0, m, n0, n);
  317. break;
  318. case 0x41:
  319. mc = 4;
  320. nc = 1;
  321. gemm<4, 1>(m0, m, n0, n);
  322. break;
  323. case 0x22:
  324. mc = 2;
  325. nc = 2;
  326. gemm<2, 2>(m0, m, n0, n);
  327. break;
  328. case 0x15:
  329. mc = 1;
  330. nc = 5;
  331. gemm<1, 5>(m0, m, n0, n);
  332. break;
  333. case 0x14:
  334. mc = 1;
  335. nc = 4;
  336. gemm<1, 4>(m0, m, n0, n);
  337. break;
  338. case 0x31:
  339. mc = 3;
  340. nc = 1;
  341. gemm<3, 1>(m0, m, n0, n);
  342. break;
  343. case 0x13:
  344. mc = 1;
  345. nc = 3;
  346. gemm<1, 3>(m0, m, n0, n);
  347. break;
  348. case 0x21:
  349. mc = 2;
  350. nc = 1;
  351. gemm<2, 1>(m0, m, n0, n);
  352. break;
  353. case 0x12:
  354. mc = 1;
  355. nc = 2;
  356. gemm<1, 2>(m0, m, n0, n);
  357. break;
  358. case 0x11:
  359. mc = 1;
  360. nc = 1;
  361. gemm<1, 1>(m0, m, n0, n);
  362. break;
  363. default:
  364. return;
  365. }
  366. mp = m0 + (m - m0) / mc * mc;
  367. np = n0 + (n - n0) / nc * nc;
  368. mnpack(mp, m, n0, np);
  369. mnpack(m0, m, np, n);
  370. }
  371. template <int RM, int RN>
  372. NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  373. int64_t ytiles = (m - m0) / RM;
  374. int64_t xtiles = (n - n0) / RN;
  375. int64_t tiles = xtiles * ytiles;
  376. int64_t duty = (tiles + nth - 1) / nth;
  377. int64_t start = duty * ith;
  378. int64_t end = start + duty;
  379. if (end > tiles)
  380. end = tiles;
  381. for (int64_t job = start; job < end; ++job) {
  382. int64_t ii = m0 + job / xtiles * RM;
  383. int64_t jj = n0 + job % xtiles * RN;
  384. D Cv[RN][RM] = {};
  385. for (int64_t l = 0; l < k; l += KN)
  386. for (int64_t j = 0; j < RN; ++j)
  387. for (int64_t i = 0; i < RM; ++i)
  388. Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
  389. load<V>(B + ldb * (jj + j) + l),
  390. Cv[j][i]);
  391. for (int64_t j = 0; j < RN; ++j)
  392. for (int64_t i = 0; i < RM; ++i)
  393. C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
  394. }
  395. }
  396. const TA *const A;
  397. const TB *const B;
  398. TC *const C;
  399. const int64_t k;
  400. const int64_t lda;
  401. const int64_t ldb;
  402. const int64_t ldc;
  403. const int ith;
  404. const int nth;
  405. };
  406. //////////////////////////////////////////////////////////////////////////////////////////
  407. // QUANT ZERO MATRIX MULTIPLICATION
  408. #if defined(__ARM_FEATURE_DOTPROD)
  409. template <typename TA>
  410. class tinyBLAS_Q0_ARM {
  411. public:
  412. tinyBLAS_Q0_ARM(int64_t k,
  413. const TA *A, int64_t lda,
  414. const block_q8_0 *B, int64_t ldb,
  415. float *C, int64_t ldc,
  416. int ith, int nth)
  417. : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
  418. }
  419. void matmul(int64_t m, int64_t n, int task) {
  420. if (task == GGML_TASK_TYPE_COMPUTE)
  421. mnpack(0, m, 0, n);
  422. }
  423. private:
  424. NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  425. int64_t mc, nc, mp, np;
  426. switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) {
  427. case 0x33:
  428. mc = 3;
  429. nc = 3;
  430. gemm<3, 3>(m0, m, n0, n);
  431. break;
  432. case 0x32:
  433. mc = 3;
  434. nc = 2;
  435. gemm<3, 2>(m0, m, n0, n);
  436. break;
  437. case 0x23:
  438. mc = 2;
  439. nc = 3;
  440. gemm<2, 3>(m0, m, n0, n);
  441. break;
  442. case 0x22:
  443. mc = 2;
  444. nc = 2;
  445. gemm<2, 2>(m0, m, n0, n);
  446. break;
  447. case 0x31:
  448. mc = 3;
  449. nc = 1;
  450. gemm<3, 1>(m0, m, n0, n);
  451. break;
  452. case 0x13:
  453. mc = 1;
  454. nc = 3;
  455. gemm<1, 3>(m0, m, n0, n);
  456. break;
  457. case 0x21:
  458. mc = 2;
  459. nc = 1;
  460. gemm<2, 1>(m0, m, n0, n);
  461. break;
  462. case 0x12:
  463. mc = 1;
  464. nc = 2;
  465. gemm<1, 2>(m0, m, n0, n);
  466. break;
  467. case 0x11:
  468. mc = 1;
  469. nc = 1;
  470. gemm<1, 1>(m0, m, n0, n);
  471. break;
  472. default:
  473. return;
  474. }
  475. mp = m0 + (m - m0) / mc * mc;
  476. np = n0 + (n - n0) / nc * nc;
  477. mnpack(mp, m, n0, np);
  478. mnpack(m0, m, np, n);
  479. }
  480. template <int RM, int RN>
  481. NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  482. int64_t ytiles = (m - m0) / RM;
  483. int64_t xtiles = (n - n0) / RN;
  484. int64_t tiles = xtiles * ytiles;
  485. int64_t duty = (tiles + nth - 1) / nth;
  486. int64_t start = duty * ith;
  487. int64_t end = start + duty;
  488. if (end > tiles)
  489. end = tiles;
  490. for (int64_t job = start; job < end; ++job) {
  491. int64_t ii = m0 + job / xtiles * RM;
  492. int64_t jj = n0 + job % xtiles * RN;
  493. float32x4_t Cv[RN][RM] = {};
  494. for (int64_t l = 0; l < k; ++l)
  495. for (int64_t j = 0; j < RN; ++j)
  496. for (int64_t i = 0; i < RM; ++i)
  497. Cv[j][i] = vmlaq_n_f32(Cv[j][i],
  498. vcvtq_f32_s32(vdotq_s32(
  499. vdotq_s32(vdupq_n_s32(0),
  500. load_lo(A + lda * (ii + i) + l),
  501. load_lo(B + ldb * (jj + j) + l)),
  502. load_hi(A + lda * (ii + i) + l),
  503. load_hi(B + ldb * (jj + j) + l))),
  504. unhalf(A[lda * (ii + i) + l].d) *
  505. unhalf(B[ldb * (jj + j) + l].d));
  506. for (int64_t j = 0; j < RN; ++j)
  507. for (int64_t i = 0; i < RM; ++i)
  508. C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
  509. }
  510. }
  511. inline int8x16_t load_lo(const block_q8_0 *b) {
  512. return vld1q_s8(b->qs);
  513. }
  514. inline int8x16_t load_hi(const block_q8_0 *b) {
  515. return vld1q_s8(b->qs + 16);
  516. }
  517. inline int8x16_t load_lo(const block_q4_0 *b) {
  518. return vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vld1q_u8(b->qs),
  519. vdupq_n_u8(0x0f))),
  520. vdupq_n_s8(0x8));
  521. }
  522. inline int8x16_t load_hi(const block_q4_0 *b) {
  523. return vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(vld1q_u8(b->qs), 4)),
  524. vdupq_n_s8(0x8));
  525. }
  526. const TA *const A;
  527. const block_q8_0 *const B;
  528. float *const C;
  529. const int64_t k;
  530. const int64_t lda;
  531. const int64_t ldb;
  532. const int64_t ldc;
  533. const int ith;
  534. const int nth;
  535. };
  536. #endif // __ARM_FEATURE_DOTPROD
  537. #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  538. template <typename TA, typename TB, typename TC>
  539. class tinyBLAS_Q0_AVX {
  540. public:
  541. tinyBLAS_Q0_AVX(int64_t k,
  542. const TA *A, int64_t lda,
  543. const TB *B, int64_t ldb,
  544. TC *C, int64_t ldc,
  545. int ith, int nth)
  546. : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
  547. }
  548. void matmul(int64_t m, int64_t n, int task) {
  549. if (task == GGML_TASK_TYPE_COMPUTE)
  550. mnpack(0, m, 0, n);
  551. }
  552. private:
  553. void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  554. int64_t mc, nc, mp, np;
  555. switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
  556. #if VECTOR_REGISTERS == 32
  557. case 0x44:
  558. mc = 4;
  559. nc = 4;
  560. gemm<4, 4>(m0, m, n0, n);
  561. break;
  562. case 0x43:
  563. mc = 4;
  564. nc = 3;
  565. gemm<4, 3>(m0, m, n0, n);
  566. break;
  567. case 0x34:
  568. mc = 3;
  569. nc = 4;
  570. gemm<3, 4>(m0, m, n0, n);
  571. break;
  572. case 0x33:
  573. mc = 3;
  574. nc = 3;
  575. gemm<3, 3>(m0, m, n0, n);
  576. break;
  577. case 0x42:
  578. mc = 4;
  579. nc = 2;
  580. gemm<4, 2>(m0, m, n0, n);
  581. break;
  582. case 0x24:
  583. mc = 2;
  584. nc = 4;
  585. gemm<2, 4>(m0, m, n0, n);
  586. break;
  587. #else
  588. case 0x44:
  589. case 0x43:
  590. case 0x42:
  591. mc = 4;
  592. nc = 2;
  593. gemm<4, 2>(m0, m, n0, n);
  594. break;
  595. case 0x34:
  596. case 0x24:
  597. mc = 2;
  598. nc = 4;
  599. gemm<2, 4>(m0, m, n0, n);
  600. break;
  601. case 0x33:
  602. #endif
  603. case 0x32:
  604. mc = 3;
  605. nc = 2;
  606. gemm<3, 2>(m0, m, n0, n);
  607. break;
  608. case 0x23:
  609. mc = 2;
  610. nc = 3;
  611. gemm<2, 3>(m0, m, n0, n);
  612. break;
  613. case 0x41:
  614. mc = 4;
  615. nc = 1;
  616. gemm<4, 1>(m0, m, n0, n);
  617. break;
  618. case 0x22:
  619. mc = 2;
  620. nc = 2;
  621. gemm<2, 2>(m0, m, n0, n);
  622. break;
  623. case 0x14:
  624. mc = 1;
  625. nc = 4;
  626. gemm<1, 4>(m0, m, n0, n);
  627. break;
  628. case 0x31:
  629. mc = 3;
  630. nc = 1;
  631. gemm<3, 1>(m0, m, n0, n);
  632. break;
  633. case 0x13:
  634. mc = 1;
  635. nc = 3;
  636. gemm<1, 3>(m0, m, n0, n);
  637. break;
  638. case 0x21:
  639. mc = 2;
  640. nc = 1;
  641. gemm<2, 1>(m0, m, n0, n);
  642. break;
  643. case 0x12:
  644. mc = 1;
  645. nc = 2;
  646. gemm<1, 2>(m0, m, n0, n);
  647. break;
  648. case 0x11:
  649. mc = 1;
  650. nc = 1;
  651. gemm<1, 1>(m0, m, n0, n);
  652. break;
  653. default:
  654. return;
  655. }
  656. mp = m0 + (m - m0) / mc * mc;
  657. np = n0 + (n - n0) / nc * nc;
  658. mnpack(mp, m, n0, np);
  659. mnpack(m0, m, np, n);
  660. }
  661. template <int RM, int RN>
  662. NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  663. int64_t ytiles = (m - m0) / RM;
  664. int64_t xtiles = (n - n0) / RN;
  665. int64_t tiles = xtiles * ytiles;
  666. int64_t duty = (tiles + nth - 1) / nth;
  667. int64_t start = duty * ith;
  668. int64_t end = start + duty;
  669. if (end > tiles)
  670. end = tiles;
  671. for (int64_t job = start; job < end; ++job) {
  672. int64_t ii = m0 + job / xtiles * RM;
  673. int64_t jj = n0 + job % xtiles * RN;
  674. __m256 Cv[RN][RM] = {};
  675. for (int64_t l = 0; l < k; ++l)
  676. for (int64_t j = 0; j < RN; ++j)
  677. for (int64_t i = 0; i < RM; ++i) {
  678. #if defined(__AVX2__)
  679. __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
  680. load(A + lda * (ii + i) + l)),
  681. _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
  682. load(A + lda * (ii + i) + l)));
  683. #else
  684. __m128i ali0 = load0(A + lda * (ii + i) + l);
  685. __m128i ali1 = load1(A + lda * (ii + i) + l);
  686. __m128i blj0 = load0(B + ldb * (jj + j) + l);
  687. __m128i blj1 = load1(B + ldb * (jj + j) + l);
  688. __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
  689. __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
  690. __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
  691. __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
  692. // updot
  693. const __m128i oneFill = _mm_set1_epi16(1);
  694. __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
  695. __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
  696. __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
  697. #endif
  698. Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
  699. unhalf(B[ldb * (jj + j) + l].d)),
  700. udTmp,
  701. Cv[j][i]);
  702. }
  703. for (int64_t j = 0; j < RN; ++j)
  704. for (int64_t i = 0; i < RM; ++i)
  705. C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
  706. }
  707. }
  708. inline __m256i load(const block_q8_0 *b) {
  709. return _mm256_loadu_si256((const __m256i *)b->qs);
  710. }
  711. inline __m128i load0(const block_q8_0 *b) {
  712. return _mm_loadu_si128((const __m128i *)b->qs);
  713. }
  714. inline __m128i load1(const block_q8_0 *b) {
  715. return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
  716. }
  717. inline __m256i load(const block_q4_0 *b) {
  718. return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
  719. }
  720. inline __m128i load0(const block_q4_0 *b) {
  721. const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
  722. return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
  723. }
  724. inline __m128i load1(const block_q4_0 *b) {
  725. const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
  726. return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
  727. }
  728. inline __m256 updot(__m256i u, __m256i s) {
  729. __m256i res;
  730. #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
  731. res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
  732. #else
  733. res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
  734. #endif
  735. return _mm256_cvtepi32_ps(res);
  736. }
  737. static inline __m256i denibble(const uint8_t *p) {
  738. __m128i x = _mm_loadu_si128((const __m128i *)p);
  739. return _mm256_and_si256(_mm256_set1_epi8(15),
  740. _mm256_insertf128_si256(_mm256_castsi128_si256(x),
  741. _mm_srli_epi16(x, 4), 1));
  742. }
  743. const TA *const A;
  744. const TB *const B;
  745. TC *const C;
  746. const int64_t k;
  747. const int64_t lda;
  748. const int64_t ldb;
  749. const int64_t ldc;
  750. const int ith;
  751. const int nth;
  752. };
  753. #endif // __AVX__
  754. } // namespace
  755. /**
  756. * Performs optimized matrix multiplication on CPU.
  757. *
  758. * This subroutine may compute C = Aᵀ * B with column major ordering.
  759. * Despite its name, this isn't a generalized implementation. Work is
  760. * only performed when a handwritten kernel is written and available.
  761. * Otherwise the caller should fall back to a general matmul routine.
  762. *
  763. * For example, for single-threaded single-precision GEMM you can say
  764. *
  765. * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
  766. * 0, 1, GGML_TASK_TYPE_COMPUTE,
  767. * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
  768. *
  769. * @param m is rows in `A` and `C`
  770. * @param n is cols in `B` and `C`
  771. * @param k is cols in `A` and rows in `B`
  772. * @param A is first input matrix (always transposed)
  773. * @param lda is row stride of `A`
  774. * @param B is second input matrix (never transposed)
  775. * @param ldb is row stride of `B`
  776. * @param C is input/output array of output matrices
  777. * @param ldc is row stride of `C`
  778. * @param ith is thread id (must be less than `nth`)
  779. * @param nth is number of threads (must be greater than zero)
  780. * @param task is GGML task type
  781. * @param Atype is GGML data type of `A`
  782. * @param Btype is GGML data type of `B`
  783. * @param Ctype is GGML data type of `C`
  784. * @return true if this function was able to service the matmul request
  785. */
  786. bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
  787. int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
  788. assert(m >= 0);
  789. assert(n >= 0);
  790. assert(k >= 0);
  791. assert(lda >= k);
  792. assert(ldb >= k);
  793. assert(ldc >= m);
  794. assert(nth > 0);
  795. assert(ith < nth);
  796. if (Ctype != GGML_TYPE_F32)
  797. return false;
  798. switch (Atype) {
  799. case GGML_TYPE_F32: {
  800. if (Btype != GGML_TYPE_F32)
  801. return false;
  802. #if defined(__AVX512F__)
  803. if (k % 16)
  804. return false;
  805. tinyBLAS<16, __m512, __m512, float, float, float> tb{
  806. k, (const float *)A, lda,
  807. (const float *)B, ldb,
  808. (float *)C, ldc,
  809. ith, nth};
  810. tb.matmul(m, n, task);
  811. return true;
  812. #elif defined(__AVX__) || defined(__AVX2__)
  813. if (k % 8)
  814. return false;
  815. tinyBLAS<8, __m256, __m256, float, float, float> tb{
  816. k, (const float *)A, lda,
  817. (const float *)B, ldb,
  818. (float *)C, ldc,
  819. ith, nth};
  820. tb.matmul(m, n, task);
  821. return true;
  822. #elif defined(__ARM_NEON)
  823. if (n < 4)
  824. return false;
  825. if (k % 4)
  826. return false;
  827. tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
  828. k, (const float *)A, lda,
  829. (const float *)B, ldb,
  830. (float *)C, ldc,
  831. ith, nth};
  832. tb.matmul(m, n, task);
  833. return true;
  834. #else
  835. return false;
  836. #endif
  837. }
  838. case GGML_TYPE_F16: {
  839. #if defined(__AVX512F__)
  840. if (k % 16)
  841. return false;
  842. if (Btype != GGML_TYPE_F32)
  843. return false;
  844. tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
  845. k, (const ggml_fp16_t *)A, lda,
  846. (const float *)B, ldb,
  847. (float *)C, ldc,
  848. ith, nth};
  849. tb.matmul(m, n, task);
  850. return true;
  851. #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
  852. if (k % 8)
  853. return false;
  854. if (Btype != GGML_TYPE_F32)
  855. return false;
  856. tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
  857. k, (const ggml_fp16_t *)A, lda,
  858. (const float *)B, ldb,
  859. (float *)C, ldc,
  860. ith, nth};
  861. tb.matmul(m, n, task);
  862. return true;
  863. #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
  864. if (n < 8)
  865. return false;
  866. if (k % 8)
  867. return false;
  868. if (Btype != GGML_TYPE_F16)
  869. return false;
  870. tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
  871. k, (const ggml_fp16_t *)A, lda,
  872. (const ggml_fp16_t *)B, ldb,
  873. (float *)C, ldc,
  874. ith, nth};
  875. tb.matmul(m, n, task);
  876. return true;
  877. #elif defined(__ARM_NEON) && !defined(_MSC_VER)
  878. if (k % 4)
  879. return false;
  880. if (Btype != GGML_TYPE_F32)
  881. return false;
  882. tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
  883. k, (const ggml_fp16_t *)A, lda,
  884. (const float *)B, ldb,
  885. (float *)C, ldc,
  886. ith, nth};
  887. tb.matmul(m, n, task);
  888. return true;
  889. #else
  890. return false;
  891. #endif
  892. }
  893. case GGML_TYPE_Q8_0: {
  894. if (Btype != GGML_TYPE_Q8_0)
  895. return false;
  896. #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  897. tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
  898. k, (const block_q8_0 *)A, lda,
  899. (const block_q8_0 *)B, ldb,
  900. (float *)C, ldc,
  901. ith, nth};
  902. tb.matmul(m, n, task);
  903. return true;
  904. #elif defined(__ARM_FEATURE_DOTPROD)
  905. tinyBLAS_Q0_ARM<block_q8_0> tb{
  906. k, (const block_q8_0 *)A, lda,
  907. (const block_q8_0 *)B, ldb,
  908. (float *)C, ldc,
  909. ith, nth};
  910. tb.matmul(m, n, task);
  911. return true;
  912. #else
  913. return false;
  914. #endif
  915. }
  916. case GGML_TYPE_Q4_0: {
  917. if (Btype != GGML_TYPE_Q8_0)
  918. return false;
  919. #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  920. tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
  921. k, (const block_q4_0 *)A, lda,
  922. (const block_q8_0 *)B, ldb,
  923. (float *)C, ldc,
  924. ith, nth};
  925. tb.matmul(m, n, task);
  926. return true;
  927. #elif defined(__ARM_FEATURE_DOTPROD)
  928. tinyBLAS_Q0_ARM<block_q4_0> tb{
  929. k, (const block_q4_0 *)A, lda,
  930. (const block_q8_0 *)B, ldb,
  931. (float *)C, ldc,
  932. ith, nth};
  933. tb.matmul(m, n, task);
  934. return true;
  935. #else
  936. return false;
  937. #endif
  938. }
  939. default:
  940. return false;
  941. }
  942. (void)m;
  943. (void)n;
  944. (void)k;
  945. (void)A;
  946. (void)lda;
  947. (void)B;
  948. (void)ldb;
  949. (void)C;
  950. (void)ldc;
  951. (void)ith;
  952. (void)nth;
  953. (void)task;
  954. (void)Atype;
  955. (void)Btype;
  956. (void)Ctype;
  957. }