// vecdotq.cuh

/**
 * llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "common.cuh"

#include <cstdint>

static __device__ __forceinline__ int get_int_b2(const void * x, const int & i32) {
    const uint16_t * x16 = (const uint16_t *) x; // assume at least 2 byte alignment

    int x32 = x16[2*i32 + 0] <<  0;
    x32    |= x16[2*i32 + 1] << 16;

    return x32;
}

static __device__ __forceinline__ int get_int_b4(const void * x, const int & i32) {
    return ((const int *) x)[i32]; // assume at least 4 byte alignment
}
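
// Note: the _b2/_b4 suffix reflects the alignment guarantee of the source buffer. Block layouts that only
// guarantee 2-byte alignment (e.g. a half scale followed by byte data) go through get_int_b2, which
// assembles the 32-bit word from two 16-bit loads, while 4-byte-aligned data such as block_q8_1::qs can
// be read with a single 32-bit load via get_int_b4.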

// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

#define VDR_Q4_0_Q8_1_MMVQ 2
#define VDR_Q4_0_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
    const int * v, const int * u, const float & d4, const half2 & ds8) {
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

        // SIMD dot product of quantized values
        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
    }

    const float2 ds8f = __half22float2(ds8);

    // second part effectively subtracts 8 from each quant value
    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
}
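
// Informal derivation of the correction term, assuming ds8 = {d8, sum of the q8_1 block's values}:
// q4_0 stores x_j = d4 * (q4_j - 8) and q8_1 stores y_j = d8 * q8_j, so for a full block
//     sum_j x_j*y_j = d4*d8*sum_j(q4_j*q8_j) - 8*d4*sum_j(y_j) = d4*(sumi*ds8.x - 8*ds8.y).
// Each thread only covers vdr of the QI4_0 ints, so it contributes the fraction vdr/QI4_0 of the
// -8*ds8.y term; summed over the QI4_0/vdr threads that share a block, the full correction is
// accumulated exactly.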

#define VDR_Q4_1_Q8_1_MMVQ 2
#define VDR_Q4_1_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
    const int * v, const int * u, const half2 & dm4, const half2 & ds8) {
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;

        // SIMD dot product of quantized values
        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi);
        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi);
    }

#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
    const float d4d8 = tmp.x;
    const float m4s8 = tmp.y;
#else
    const float2 dm4f = __half22float2(dm4);
    const float2 ds8f = __half22float2(ds8);
    const float d4d8 = dm4f.x * ds8f.x;
    const float m4s8 = dm4f.y * ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
}
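
// q4_1 is an affine quantization, x_j = d4*q4_j + m4, with dm4 = {d4, m4}. The m4 * sum_j(y_j) term is a
// whole-block quantity, so it is divided by QI8_1/(vdr*QR4_1), i.e. the number of threads that process
// the same block, and thereby gets added exactly once in total.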

#define VDR_Q5_0_Q8_1_MMVQ 2
#define VDR_Q5_0_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
    const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) {
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values

        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
    }

    const float2 ds8f = __half22float2(ds8);

    // second part effectively subtracts 16 from each quant value
    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
}
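
// The shift/mask cascade above scatters the packed qh bits so that the high bit of quant value j lands in
// bit position 8*j + 4 of vi0/vi1, i.e. it becomes the 5th bit on top of the 4-bit nibble already sitting
// in that byte (the "0 -> 4", "1 -> 12", ... comments give the source -> destination bit positions).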

#define VDR_Q5_1_Q8_1_MMVQ 2
#define VDR_Q5_1_Q8_1_MMQ  4

template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
    const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) {
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
        sumi = ggml_cuda_dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values

        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
        sumi = ggml_cuda_dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
    }

#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
    const float d5d8 = tmp.x;
    const float m5s8 = tmp.y;
#else
    const float2 dm5f = __half22float2(dm5);
    const float2 ds8f = __half22float2(ds8);
    const float d5d8 = dm5f.x * ds8f.x;
    const float m5s8 = dm5f.y * ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
}

#define VDR_Q8_0_Q8_1_MMVQ 2
#define VDR_Q8_0_Q8_1_MMQ  8

template <typename T, int vdr> static __device__ __forceinline__ T vec_dot_q8_0_q8_1_impl(
    const int * v, const int * u, const T & d8_0, const T & d8_1) {
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
        sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
    }

    return d8_0*d8_1 * ((T) sumi);
}

template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl(
    const int * v, const int * u, const half2 & dm8, const half2 & ds8) {
    int sumi = 0;

#pragma unroll
    for (int i = 0; i < vdr; ++i) {
        // SIMD dot product of quantized values
        sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
    }

#ifdef GGML_CUDA_F16
    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
    const float d8d8 = tmp.x;
    const float m8s8 = tmp.y;
#else
    const float2 dm8f = __half22float2(dm8);
    const float2 ds8f = __half22float2(ds8);
    const float d8d8 = dm8f.x * ds8f.x;
    const float m8s8 = dm8f.y * ds8f.y;
#endif // GGML_CUDA_F16

    // scale second part of sum by QI8_1 / vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
}

template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_16_q8_1_impl(
    const int * v, const int * u, const float * d8_0, const float & d8_1) {
    float sumf = 0.0f;

#pragma unroll
    for (int i0 = 0; i0 < vdr; i0 += QI8_0/2) {
        int sumi = 0;

#pragma unroll
        for (int i = i0; i < i0 + QI8_0/2; ++i) {
            // SIMD dot product of quantized values
            sumi = ggml_cuda_dp4a(v[i], u[i], sumi);
        }

        sumf += d8_0[i0/(QI8_0/2)]*sumi;
    }

    return d8_1*sumf;
}

#define VDR_Q2_K_Q8_1_MMVQ 1
#define VDR_Q2_K_Q8_1_MMQ  4

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
    const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const half2 & dm2, const float * __restrict__ d8) {
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR2_K; ++i) {
        const int sc = scales[2*i];

        const int vi = (v >> (2*i)) & 0x03030303;

        sumf_d += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product

        // fill int with 4x m
        int m = sc >> 4;
        m |= m <<  8;
        m |= m << 16;
        sumf_m += d8[i] * ggml_cuda_dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
    }

    const float2 dm2f = __half22float2(dm2);

    return dm2f.x*sumf_d - dm2f.y*sumf_m;
}
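
// Note on the replication trick above: broadcasting the 4-bit minimum m into all four bytes of an int and
// feeding it to dp4a computes m * (sum of the four q8 bytes in u[i]) in a single instruction; the same
// idea appears below with the constant 0x01010101, which yields a plain byte sum.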

// contiguous v/x + u/y values
template <int ns8>
static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const half2 * dm2, const float & d8, const half2 * s8) {
    float sumf    = 0.0f;
    float sumf_d8 = 0.0f;

#pragma unroll
    for (int i0 = 0; i0 < QR2_K*VDR_Q2_K_Q8_1_MMQ; i0 += QI8_1) {
        const float2 dm2f0 = __half22float2(dm2[i0/(QI8_1/2) + 0]);
        int sumi_d0 = 0;

        const float2 dm2f1 = __half22float2(dm2[i0/(QI8_1/2) + 1]);
        int sumi_d1 = 0;

#pragma unroll
        for (int i = i0; i < i0 + QI8_1/2; ++i) {
            sumi_d0 = ggml_cuda_dp4a(v[i], u[i], sumi_d0);
        }
        sumf_d8 += dm2f0.x * sumi_d0;

#pragma unroll
        for (int i = i0 + QI8_1/2; i < i0 + QI8_1; ++i) {
            sumi_d1 = ggml_cuda_dp4a(v[i], u[i], sumi_d1);
        }
        sumf_d8 += dm2f1.x * sumi_d1;

        if (i0/QI8_1 < ns8) {
            const float2 s8f = __half22float2(s8[i0/QI8_1]);
            sumf -= dm2f0.y*s8f.x;
            sumf -= dm2f1.y*s8f.y;
        } else {
            int sumi_m0 = 0;
#pragma unroll
            for (int i = i0; i < i0 + QI8_1/2; ++i) {
                sumi_m0 = ggml_cuda_dp4a(0x01010101, u[i], sumi_m0);
            }
            sumf_d8 -= dm2f0.y * sumi_m0;

            int sumi_m1 = 0;
#pragma unroll
            for (int i = i0 + QI8_1/2; i < i0 + QI8_1; ++i) {
                sumi_m1 = ggml_cuda_dp4a(0x01010101, u[i], sumi_m1);
            }
            sumf_d8 -= dm2f1.y * sumi_m1;
        }
    }

    return sumf + d8*sumf_d8;
}

#define VDR_Q3_K_Q8_1_MMVQ 1
#define VDR_Q3_K_Q8_1_MMQ  2

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
    const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales,
    const int & scale_offset, const float & d3, const float * __restrict__ d8) {
    float sumf = 0.0f;

#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        const int isc = scale_offset + 2*i;

        const int isc_low = isc % (QK_K/32);
        const int sc_shift_low = 4 * (isc / (QK_K/32));
        const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF;

        const int isc_high = isc % (QK_K/64);
        const int sc_shift_high = 2 * (isc / (QK_K/64));
        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;

        const int sc = (sc_low | sc_high) - 32;

        const int vil = (vl >> (2*i)) & 0x03030303;

        const int vih = ((vh >> i) << 2) & 0x04040404;

        const int vi = __vsubss4(vil, vih);

        sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d3 * sumf;
}

// contiguous v/x + u/y values
static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d3, const float & d8) {
    int sumi = 0;

#pragma unroll
    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
        int sumi_sc = 0;

#pragma unroll
        for (int i = i0; i < i0 + QI8_1/2; ++i) {
            sumi_sc = ggml_cuda_dp4a(v[i], u[i], sumi_sc); // SIMD dot product
        }

        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
    }

    return d3*d8 * sumi;
}

#define VDR_Q4_K_Q8_1_MMVQ 2
#define VDR_Q4_K_Q8_1_MMQ  8

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) {
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR4_K; ++i) {
        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;

        const int dot1 = ggml_cuda_dp4a(v1i, u[2*i+1], ggml_cuda_dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
        const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+1], ggml_cuda_dp4a(0x01010101, u[2*i+0], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
    }

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;
}

// contiguous v/x + u/y values
static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
        int sumi_d = 0;

#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
            sumi_d = ggml_cuda_dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
        }

        const float2 ds8f = __half22float2(ds8[i]);

        sumf_d += ds8f.x * (sc[i] * sumi_d);
        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
    }

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;
}

#define VDR_Q5_K_Q8_1_MMVQ 2
#define VDR_Q5_K_Q8_1_MMQ  8

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;

        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;

        const int v0i = vl0i | vh0i;
        const int v1i = vl1i | vh1i;

        const int dot1 = ggml_cuda_dp4a(v0i, u[2*i+0], ggml_cuda_dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
        const int dot2 = ggml_cuda_dp4a(0x01010101, u[2*i+0], ggml_cuda_dp4a(0x01010101, u[2*i+1], 0)); // sum of u

        sumf_d += d8[i] * (dot1 * sc[i]);
        sumf_m += d8[i] * (dot2 * m[i]);
    }

    const float2 dm5f = __half22float2(dm5);

    return dm5f.x*sumf_d - dm5f.y*sumf_m;
}

// contiguous v/x + u/y values
static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
    float sumf_d = 0.0f;
    float sumf_m = 0.0f;

#pragma unroll
    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
        int sumi_d = 0;

#pragma unroll
        for (int j = 0; j < QI8_1; ++j) {
            sumi_d = ggml_cuda_dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
        }

        const float2 ds8f = __half22float2(ds8[i]);

        sumf_d += ds8f.x * (sc[i] * sumi_d);
        sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val
    }

    const float2 dm4f = __half22float2(dm4);

    return dm4f.x*sumf_d - dm4f.y*sumf_m;
}

#define VDR_Q6_K_Q8_1_MMVQ 1
#define VDR_Q6_K_Q8_1_MMQ  8

// contiguous v/x values
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
    const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales,
    const float & d, const float * __restrict__ d8) {
    float sumf = 0.0f;

#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        const int sc = scales[4*i];

        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;

        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;

        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32

        sumf += d8[i] * (ggml_cuda_dp4a(vi, u[i], 0) * sc); // SIMD dot product
    }

    return d*sumf;
}
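
// __vsubss4 performs a per-byte subtraction with signed saturation, so the line above recenters the
// reassembled 6-bit quants (low 4 bits from ql, high 2 bits from qh) from [0, 63] to [-32, 31] before the
// dp4a accumulation.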

// contiguous v/x + u/y values
static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
    const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc,
    const float & d6, const float * __restrict__ d8) {
    float sumf_d = 0.0f;

    const int      sc_packed = get_int_b4(sc, 0);
    const int8_t * sc_reg    = (const int8_t *) &sc_packed;

#pragma unroll
    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale

#pragma unroll
        for (int i = i0; i < i0 + 2; ++i) {
            sumi_d.x = ggml_cuda_dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
            sumi_d.x = ggml_cuda_dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product

            sumi_d.y = ggml_cuda_dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
            sumi_d.y = ggml_cuda_dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
        }

        sumf_d += d8[i0/4] * (sc_reg[i0/2+0]*sumi_d.x + sc_reg[i0/2+1]*sumi_d.y);
    }

    return d6 * sumf_d;
}

static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq + kbx;

    int v[VDR_Q4_0_Q8_1_MMVQ];
    int u[2*VDR_Q4_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
        v[i]     = get_int_b2(bq4_0->qs, iqs + i);
        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_0);
    }

    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
}
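
// The functions from here on are the per-block entry points used by the MMVQ path: vbq points at the
// quantized weight row, kbx selects the block within it, bq8_1 points at the matching quantized
// activation block(s), and iqs selects which VDR-sized slice of the block this thread handles.
// Roughly (illustrative sketch only, see mmvq.cu for the actual kernel):
//
//     float sum = 0.0f;
//     for (/* each block kbx assigned to this thread */) {
//         sum += vec_dot_q4_0_q8_1(x_row, &y_q8_1[kbx_q8], kbx, iqs);
//     }
//     // ... followed by a warp-level reduction of sum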

static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq + kbx;

    int v[VDR_Q4_1_Q8_1_MMVQ];
    int u[2*VDR_Q4_1_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
        v[i]     = get_int_b4(bq4_1->qs, iqs + i);
        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI4_1);
    }

    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
}

static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq + kbx;

    int vl[VDR_Q5_0_Q8_1_MMVQ];
    int vh[VDR_Q5_0_Q8_1_MMVQ];
    int  u[2*VDR_Q5_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
        vl[i]    = get_int_b2(bq5_0->qs, iqs + i);
        vh[i]    = get_int_b2(bq5_0->qh, 0) >> (4 * (iqs + i));
        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_0);
    }

    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
}

static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq + kbx;

    int vl[VDR_Q5_1_Q8_1_MMVQ];
    int vh[VDR_Q5_1_Q8_1_MMVQ];
    int  u[2*VDR_Q5_1_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
        vl[i]    = get_int_b4(bq5_1->qs, iqs + i);
        vh[i]    = get_int_b4(bq5_1->qh, 0) >> (4 * (iqs + i));
        u[2*i+0] = get_int_b4(bq8_1->qs, iqs + i);
        u[2*i+1] = get_int_b4(bq8_1->qs, iqs + i + QI5_1);
    }

    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
}

static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq + kbx;

    int v[VDR_Q8_0_Q8_1_MMVQ];
    int u[VDR_Q8_0_Q8_1_MMVQ];

#pragma unroll
    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
        v[i] = get_int_b2(bq8_0->qs, iqs + i);
        u[i] = get_int_b4(bq8_1->qs, iqs + i);
    }

    return vec_dot_q8_0_q8_1_impl<float, VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
}

static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_q2_K * bq2_K = (const block_q2_K *) vbq + kbx;

    const int bq8_offset = QR2_K * (iqs / QI8_1);
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const uint8_t * scales = bq2_K->scales + scale_offset;

    const int v = get_int_b4(bq2_K->qs, iqs);

    int    u[QR2_K];
    float d8[QR2_K];

#pragma unroll
    for (int i = 0; i < QR2_K; ++ i) {
        u[i]  = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }

    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
}

static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_q3_K * bq3_K = (const block_q3_K *) vbq + kbx;

    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);

    const float d = bq3_K->d;

    const int vl = get_int_b2(bq3_K->qs, iqs);

    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
    const int vh = ~get_int_b2(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;

    int    u[QR3_K];
    float d8[QR3_K];

#pragma unroll
    for (int i = 0; i < QR3_K; ++i) {
        u[i]  = get_int_b4(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
    }

    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
}

static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_q4_K * bq4_K = (const block_q4_K *) vbq + kbx;

    int    v[2];
    int    u[2*QR4_K];
    float d8[QR4_K];

    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));

    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108

    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    v[0] = q4[0];
    v[1] = q4[4];

    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m  = sc + 2;

    for (int i = 0; i < QR4_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = __low2float(bq8i->ds);

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
}
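
// The aux[] handling above unpacks the 12 scale bytes of q4_K (and of q5_K below), which store eight
// 6-bit sub-block scales and eight 6-bit minimums in a packed layout: for the first half of the
// super-block the 6-bit values can be masked out directly (0x3f3f), for the second half the low 4 bits
// and the spilled-over high 2 bits have to be recombined. sc then points at the two scales and m at the
// two minimums relevant for this bq8_offset.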

static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_q5_K * bq5_K = (const block_q5_K *) vbq + kbx;

    int   vl[2];
    int   vh[2];
    int    u[2*QR5_K];
    float d8[QR5_K];

    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));

    vl[0] = ql[0];
    vl[1] = ql[4];

    vh[0] = qh[0] >> bq8_offset;
    vh[1] = qh[4] >> bq8_offset;

    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;
    const uint8_t * m  = sc + 2;

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = __low2float(bq8i->ds);

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_q6_K * bq6_K = (const block_q6_K *) vbq + kbx;

    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));

    const int vl = get_int_b2(bq6_K->ql, iqs);
    const int vh = get_int_b2(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;

    const int8_t * scales = bq6_K->scales + scale_offset;

    int    u[QR6_K];
    float d8[QR6_K];

#pragma unroll
    for (int i = 0; i < QR6_K; ++i) {
        u[i]  = get_int_b4(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
        d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
    }

    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
}

#define VDR_IQ2_XXS_Q8_1_MMVQ 2
#define VDR_IQ2_XXS_Q8_1_MMQ  2
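
// The iq* vec dots below share one idiom for applying packed sign bits: __vcmpne4 expands them into a
// per-byte mask of 0x00 or 0xFF, and __vsub4(grid ^ mask, mask) negates exactly the bytes whose mask is
// 0xFF (per byte, (g ^ 0xFF) - 0xFF == ~g + 1 == -g modulo 256) while leaving the other bytes untouched.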

static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx;

    const int q2 = get_int_b2(bq2->qs, iqs);
    const uint8_t * aux8 = (const uint8_t *) &q2;
    const uint32_t aux32 = get_int_b2(bq2->qs, iqs + 1);

    int sumi = 0;
#pragma unroll
    for (int k0 = 0; k0 < 8; k0 += 2) {
        const int * grid_pos = (const int *) (iq2xxs_grid + aux8[k0/2]);
        const int signs_packed = ksigns_iq2xs[(aux32 >> (7*k0/2)) & 0x7F];

        const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
        const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
        const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0);
        sumi = ggml_cuda_dp4a(grid0, u0, sumi);

        const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
        const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
        const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1);
        sumi = ggml_cuda_dp4a(grid1, u1, sumi);
    }

    const int ls = aux32 >> 28;
    sumi = (ls*sumi + sumi/2)/4;
    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
}

#define VDR_IQ2_XS_Q8_1_MMVQ 2
#define VDR_IQ2_XS_Q8_1_MMQ  2

static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx;

    const int2 q2_packed = make_int2(get_int_b2(bq2->qs, iqs + 0), get_int_b2(bq2->qs, iqs + 1));
    const uint16_t * q2 = (const uint16_t *) &q2_packed;
    const int ls0 = bq2->scales[iqs/2] & 0x0F;
    const int ls1 = bq2->scales[iqs/2] >> 4;

    int sumi0 = 0;
    int sumi1 = 0;
#pragma unroll
    for (int l0 = 0; l0 < 8; l0 += 2) {
        const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l0/2] & 0x000001FF));
        const uint32_t * signs    = (const uint32_t *)(ksigns64 + (q2[l0/2] >> 9));

        const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
        const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);

        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);

        if (l0 < 4) {
            sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
            sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
        } else {
            sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
            sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
        }
    }

    const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
}

#define VDR_IQ2_S_Q8_1_MMVQ 2
#define VDR_IQ2_S_Q8_1_MMQ  2

static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx;

    const int       qs_packed = get_int_b2(bq2->qs, iqs/2);
    const uint8_t * qs        = (const uint8_t *) &qs_packed;

    const int qh = bq2->qh[iqs/2];

    const int       signs_packed_32 = get_int_b2(bq2->qs, QK_K/32 + iqs/2);
    const uint8_t * signs_packed_8  = (const uint8_t *) &signs_packed_32;

    const int ls0 = bq2->scales[iqs/2] & 0x0F;
    const int ls1 = bq2->scales[iqs/2] >> 4;

    int sumi0 = 0;
    int sumi1 = 0;
#pragma unroll
    for (int l0 = 0; l0 < 8; l0 += 2) {
        const int * grid_pos = (const int *)(iq2s_grid + (qs[l0/2] | ((qh << (8-l0)) & 0x300)));

        const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000);
        const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000);

        const int grid_l = __vsub4(grid_pos[0] ^ signs0, signs0);
        const int grid_h = __vsub4(grid_pos[1] ^ signs1, signs1);

        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);

        if (l0 < 4) {
            sumi0 = ggml_cuda_dp4a(grid_l, u0, sumi0);
            sumi0 = ggml_cuda_dp4a(grid_h, u1, sumi0);
        } else {
            sumi1 = ggml_cuda_dp4a(grid_l, u0, sumi1);
            sumi1 = ggml_cuda_dp4a(grid_h, u1, sumi1);
        }
    }

    const int sumi = (sumi0*ls0 + sumi1*ls1 + (sumi0 + sumi1)/2)/4;
    const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
}

#define VDR_IQ3_XXS_Q8_1_MMVQ 2
#define VDR_IQ3_XXS_Q8_1_MMQ  2

static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq3_xxs * bq3 = (const block_iq3_xxs *) vbq + kbx;

    const int2      q3_packed = make_int2(get_int_b2(bq3->qs, iqs), get_int_b2(bq3->qs, iqs+1));
    const uint8_t * q3        = (const uint8_t *) &q3_packed;
    const uint32_t  aux32     = get_int_b2(bq3->qs, QK_K/16 + iqs/2);

    int sumi = 0;
#pragma unroll
    for (int l0 = 0; l0 < 8; l0 += 2) {
        const int2 grid_pos = make_int2(iq3xxs_grid[q3[l0 + 0]], iq3xxs_grid[q3[l0 + 1]]);

        const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l0/2)) & 0x7F));

        const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
        const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);

        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);

        sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
        sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
    }

    const int ls = aux32 >> 28;
    sumi = (ls*sumi + sumi/2)/2;
    const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
}

#define VDR_IQ3_S_Q8_1_MMVQ 2
#define VDR_IQ3_S_Q8_1_MMQ  2

// TODO: don't use lookup table for signs
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq3_s * bq3 = (const block_iq3_s *) vbq + kbx;

    const int2      qs_packed = make_int2(get_int_b2(bq3->qs, iqs + 0), get_int_b2(bq3->qs, iqs + 1));
    const uint8_t * qs        = (const uint8_t *) &qs_packed;

    const int qh = bq3->qh[iqs/2];

    const int       signs_packed_32 = get_int_b2(bq3->signs, iqs/2);
    const uint8_t * signs_packed_8  = (const uint8_t *) &signs_packed_32;

    int sumi = 0;
#pragma unroll
    for (int l0 = 0; l0 < 8; l0 += 2) {
        const int2 grid_pos = make_int2(
            iq3s_grid[qs[l0 + 0] | ((qh << (8 - l0)) & 0x100)],
            iq3s_grid[qs[l0 + 1] | ((qh << (7 - l0)) & 0x100)]);

        const int signs0 = __vcmpne4(((signs_packed_8[l0/2] & 0x03) << 7) | ((signs_packed_8[l0/2] & 0x0C) << 21), 0x00000000);
        const int signs1 = __vcmpne4(((signs_packed_8[l0/2] & 0x30) << 3) | ((signs_packed_8[l0/2] & 0xC0) << 17), 0x00000000);

        const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
        const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);

        const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
        const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);

        sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
        sumi = ggml_cuda_dp4a(grid_h, u1, sumi);
    }

    sumi *= 1 + 2*((bq3->scales[iqs/4] >> ((iqs << 1) & 0x04)) & 0x0F);

    const float d = __half2float(bq3->d) * __low2float(bq8_1[iqs/2].ds);
    return d * sumi;
}

#define VDR_IQ1_S_Q8_1_MMVQ 1
#define VDR_IQ1_S_Q8_1_MMQ  1

static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx;

    const int       qs_packed = get_int_b2(bq1->qs, iqs);
    const uint8_t * qs        = (const uint8_t *) &qs_packed;

    const int qh = bq1->qh[iqs];

    int sumi = 0;
#pragma unroll
    for (int l0 = 0; l0 < 8; l0 += 2) {
        const int grid = iq1s_grid_gpu[qs[l0/2] | (((qh >> 3*(l0/2)) & 0x07) << 8)];

        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
        const int grid1 = (grid >> 4) & 0x0F0F0F0F;

        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);

        sumi = ggml_cuda_dp4a(grid0, u0, sumi);
        sumi = ggml_cuda_dp4a(grid1, u1, sumi);
    }

    const float  d1q   = __half2float(bq1->d) * (((qh >> 11) & 0x0E) + 1);
    const float  delta = -1.0f + IQ1S_DELTA - (qh & 0x8000) * (2.0f*IQ1S_DELTA/0x8000);
    const float2 ds    = __half22float2(bq8_1[iqs].ds);
    return d1q * (ds.x*sumi + ds.y*delta);
}

#define VDR_IQ1_M_Q8_1_MMVQ 1
#define VDR_IQ1_M_Q8_1_MMQ  1

static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx;

    const int       qs_packed = get_int_b4(bq1->qs, iqs);
    const uint8_t * qs        = (const uint8_t *) &qs_packed;

    int   sumi[2] = {0};
    float sumf[2] = {0.0f};
#pragma unroll
    for (int l0 = 0; l0 < 8; l0 += 2) {
        const int qhl = bq1->qh[2*iqs + l0/4] >> (4 * ((l0/2) % 2));

        const int grid = iq1s_grid_gpu[qs[l0/2] | ((qhl & 0x07) << 8)];

        const int grid0 = (grid >> 0) & 0x0F0F0F0F;
        const int grid1 = (grid >> 4) & 0x0F0F0F0F;

        const int u0 = get_int_b4(bq8_1[iqs].qs, l0 + 0);
        const int u1 = get_int_b4(bq8_1[iqs].qs, l0 + 1);

        sumi[l0/4] = ggml_cuda_dp4a(grid0, u0, sumi[l0/4]);
        sumi[l0/4] = ggml_cuda_dp4a(grid1, u1, sumi[l0/4]);

        const float delta = -1.0f + IQ1M_DELTA - (qhl & 0x08) * (2.0f*IQ1M_DELTA/0x08);
        int sumy = 0;
        sumy = ggml_cuda_dp4a(u0, 0x01010101, sumy);
        sumy = ggml_cuda_dp4a(u1, 0x01010101, sumy);
        sumf[l0/4] += delta*sumy;
    }

    const uint16_t * sc = (const uint16_t *) bq1->scales;

    iq1m_scale_t scale;
    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00F0) | ((sc[2] >> 4) & 0x0F00) | (sc[3] & 0xF000);
    const float d = __half2float(scale.f16) * __low2float(bq8_1[iqs].ds);

    const int tmp = sc[iqs/2] >> (6*(iqs%2));
    const int sc0 = 2*((tmp >> 0) & 0x07) + 1;
    const int sc1 = 2*((tmp >> 3) & 0x07) + 1;
    return d * ((sumi[0] + sumf[0]) * sc0 + (sumi[1] + sumf[1]) * sc1);
}

static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4) {
    const int q0_32 = (q4 >> 0) & 0x0F0F0F0F;
    const int8_t * q0_8 = (const int8_t *) &q0_32;
    const char4 val0_8 = make_char4(
        kvalues_iq4nl[q0_8[0]], kvalues_iq4nl[q0_8[1]], kvalues_iq4nl[q0_8[2]], kvalues_iq4nl[q0_8[3]]);

    const int q1_32 = (q4 >> 4) & 0x0F0F0F0F;
    const int8_t * q1_8 = (const int8_t *) &q1_32;
    const char4 val1_8 = make_char4(
        kvalues_iq4nl[q1_8[0]], kvalues_iq4nl[q1_8[1]], kvalues_iq4nl[q1_8[2]], kvalues_iq4nl[q1_8[3]]);

    return make_int2(*((const int *) &val0_8), *((const int *) &val1_8));
}
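
// get_int_from_table_16 maps the eight 4-bit indices packed in q4 through the 16-entry iq4_nl codebook
// (kvalues_iq4nl) and returns the four low-nibble lookups packed in .x and the four high-nibble lookups
// packed in .y, ready to be fed to dp4a as signed bytes.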

#define VDR_IQ4_NL_Q8_1_MMVQ 2
#define VDR_IQ4_NL_Q8_1_MMQ  4

static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq4_nl * bq4 = (const block_iq4_nl *) vbq + kbx;

    const int * q8 = (const int *) bq8_1->qs + iqs;

    int sumi = 0;
#pragma unroll
    for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
        const int aux_q4 = get_int_b2(bq4->qs, iqs + l);
        const int2 v = get_int_from_table_16(aux_q4);

        sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
        sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
    }

    const float d = __half2float(bq4->d) * __low2float(bq8_1->ds);
    return d * sumi;
}

#define VDR_IQ4_XS_Q8_1_MMVQ 4
#define VDR_IQ4_XS_Q8_1_MMQ  4

static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
    const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq + kbx;

    int sumi = 0;
#pragma unroll
    for (int j = 0; j < 4; ++j) {
        const int aux_q4 = get_int_b4(bq4->qs, iqs + j);
        const int2 v = get_int_from_table_16(aux_q4);

        const int u0 = get_int_b4(bq8_1[iqs/4].qs, j + 0);
        const int u1 = get_int_b4(bq8_1[iqs/4].qs, j + 4);

        sumi = ggml_cuda_dp4a(v.x, u0, sumi);
        sumi = ggml_cuda_dp4a(v.y, u1, sumi);
    }

    const int ls = ((bq4->scales_l[iqs/8] >> (iqs & 0x04)) & 0x0F) | (((bq4->scales_h >> (iqs/2)) & 0x03) << 4);
    sumi *= ls - 32;

    const float d = __half2float(bq4->d) * __low2float(bq8_1[iqs/4].ds);
    return d * sumi;
}