fattn.cu

/**
 * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-tile-f16.cuh"
#include "fattn-tile-f32.cuh"
#include "fattn-vec-f16.cuh"
#include "fattn-vec-f32.cuh"
#include "fattn-wmma-f16.cuh"
#include "fattn.cuh"

#include <cstdint>
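
// Dispatch to a WMMA (tensor core) FlashAttention kernel. The template arguments
// are selected from the head size Q->ne[0], the batch size Q->ne[1] (which fixes
// cols_per_block), and the requested KQ precision stored in op_params[3].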
static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * KQV = dst;
    const ggml_tensor * Q   = dst->src[0];

    const int32_t precision = KQV->op_params[3];

    if (precision != GGML_PREC_DEFAULT) {
        if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
            constexpr int cols_per_block = 16;
            switch (Q->ne[0]) {
                case 64:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, float>(ctx, dst);
                    break;
                case 80:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, float>(ctx, dst);
                    break;
                case 96:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, float>(ctx, dst);
                    break;
                case 112:
                    ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, float>(ctx, dst);
                    break;
                case 128:
                    ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
                    break;
                case 256:
                    ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
                    break;
                default:
                    GGML_ABORT("fatal error");
                    break;
            }
        } else {
            constexpr int cols_per_block = 32;
            switch (Q->ne[0]) {
                case 64:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, float>(ctx, dst);
                    break;
                case 80:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, float>(ctx, dst);
                    break;
                case 96:
                    ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, float>(ctx, dst);
                    break;
                case 112:
                    ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, float>(ctx, dst);
                    break;
                case 128:
                    ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
                    break;
                // case 256:
                //     ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
                //     break;
                default:
                    GGML_ABORT("fatal error");
                    break;
            }
        }
        return;
    }

    if (Q->ne[1] <= 8 && Q->ne[0] % WARP_SIZE == 0) {
        constexpr int cols_per_block = 8;
        switch (Q->ne[0]) {
            case 64:
                ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst);
                break;
            case 96:
                ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst);
                break;
            case 128:
                ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst);
                break;
            case 256:
                ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
                break;
            default:
                GGML_ABORT("fatal error");
                break;
        }
        return;
    }

    if (Q->ne[1] <= 32) {
        constexpr int cols_per_block = 16;
        switch (Q->ne[0]) {
            case 64:
                ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst);
                break;
            case 80:
                ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, half>(ctx, dst);
                break;
            case 96:
                ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst);
                break;
            case 112:
                ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, half>(ctx, dst);
                break;
            case 128:
                ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst);
                break;
            case 256:
                ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
                break;
            default:
                GGML_ABORT("fatal error");
                break;
        }
        return;
    }

    constexpr int cols_per_block = 32;
    switch (Q->ne[0]) {
        case 64:
            ggml_cuda_flash_attn_ext_wmma_f16_case< 64, cols_per_block, half>(ctx, dst);
            break;
        case 80:
            ggml_cuda_flash_attn_ext_wmma_f16_case< 80, cols_per_block, half>(ctx, dst);
            break;
        case 96:
            ggml_cuda_flash_attn_ext_wmma_f16_case< 96, cols_per_block, half>(ctx, dst);
            break;
        case 112:
            ggml_cuda_flash_attn_ext_wmma_f16_case<112, cols_per_block, half>(ctx, dst);
            break;
        case 128:
            ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, half>(ctx, dst);
            break;
        case 256:
            ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
            break;
        default:
            GGML_ABORT("fatal error");
            break;
    }
}
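
// FATTN_VEC_F16_CASE expands to an early-return dispatch: when the head size and
// the K/V tensor types match, the corresponding vec kernel instantiation is
// launched and the calling function returns.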
#define FATTN_VEC_F16_CASE(D, type_K, type_V)                               \
    if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) {    \
        ggml_cuda_flash_attn_ext_vec_f16_case<D, type_K, type_V>(ctx, dst); \
        return;                                                             \
    }                                                                       \
static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_tensor * Q = dst->src[0];
    ggml_tensor * K = dst->src[1];
    ggml_tensor * V = dst->src[2];
#ifdef GGML_CUDA_FA_ALL_QUANTS
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)

    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_0)

    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_1)

    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_0)

    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_1)

    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q8_0)

    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16,  GGML_TYPE_F16)

    FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
#else
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)

    FATTN_VEC_F16_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
    FATTN_VEC_F16_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
#endif // GGML_CUDA_FA_ALL_QUANTS

    on_no_fattn_vec_case(Q->ne[0]);
}
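
// Same dispatch pattern as the f16 variant above, but for the f32 vec kernels
// (used when fast fp16 math is unavailable or higher precision is requested).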
#define FATTN_VEC_F32_CASE(D, type_K, type_V)                               \
    if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) {    \
        ggml_cuda_flash_attn_ext_vec_f32_case<D, type_K, type_V>(ctx, dst); \
        return;                                                             \
    }                                                                       \
static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_tensor * Q = dst->src[0];
    ggml_tensor * K = dst->src[1];
    ggml_tensor * V = dst->src[2];
#ifdef GGML_CUDA_FA_ALL_QUANTS
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)

    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_0)

    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q4_1)

    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_0)

    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_1)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q5_1)

    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_Q8_0)

    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16,  GGML_TYPE_F16)

    FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
#else
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q8_0)

    FATTN_VEC_F32_CASE( 64, GGML_TYPE_F16, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16)
    FATTN_VEC_F32_CASE(256, GGML_TYPE_F16, GGML_TYPE_F16)
#endif // GGML_CUDA_FA_ALL_QUANTS

    on_no_fattn_vec_case(Q->ne[0]);
}
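
// Top-level FlashAttention dispatch: choose between the vec, tile, and WMMA
// kernels based on the GPU (AMD vs. NVIDIA compute capability), fp16 and tensor
// core support, the batch size Q->ne[1], and the requested precision.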
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * KQV = dst;
    const ggml_tensor * Q   = dst->src[0];

    ggml_cuda_set_device(ctx.device);
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    const int32_t precision = KQV->op_params[3];

    // On AMD the tile kernels perform poorly, use the vec kernel instead:
    if (cc >= CC_OFFSET_AMD) {
        if (precision == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
        } else {
            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
        }
        return;
    }

    if (!fast_fp16_available(cc)) {
        if (Q->ne[1] <= 8) {
            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
        } else {
            ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
        }
        return;
    }

    if (!fp16_mma_available(cc)) {
        if (Q->ne[1] <= 8) {
            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
        } else {
            ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
        }
        return;
    }

    if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
        if (precision == GGML_PREC_DEFAULT) {
            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
            return;
        } else if (Q->ne[0] <= 128) {
            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
            return;
        }
    }

    ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
}