123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371 |
- /**
- * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
- #include "common.cuh"
- #include "fattn-common.cuh"
- #include "fattn-tile-f32.cuh"
- #define FATTN_KQ_STRIDE_TILE_F32 32
// FlashAttention kernel without tensor cores that accumulates in FP32.
//
// Template parameters:
//   D                 - head size; must be a multiple of 2*WARP_SIZE (see static_assert below).
//   ncols             - number of Q columns handled per thread block. The per-thread arrays are sized
//                       ncols/nwarps, so ncols is assumed to be a multiple of nwarps.
//   nwarps            - number of warps per block; threadIdx.y is used as the warp index.
//   parallel_blocks   - number of blocks that work on the same Q columns, each taking every
//                       parallel_blocks-th K/V tile.
//   use_logit_softcap - if true, KQ values are passed through logit_softcap*tanhf(x) before the mask is added.
//
// Launch layout, as implied by the indexing in the kernel body:
//   blockIdx.x  - (Q column tile) * parallel_blocks + (index within the group of parallel blocks).
//   blockIdx.y  - index of the Q matrix (presumably the attention head); the K/V matrix is blockIdx.y / gqa_ratio.
//   threadIdx.x - lane within the warp, threadIdx.y - warp within the block.
//
// Shared memory (static): KQ[ncols*32] + KV_tmp[32][D + 1] + Q_f[ncols][D] floats.
//
// Inputs:  Q is read as float2, K and V are read as half2, mask (optional, may be null) is read as half.
// Outputs: with parallel_blocks == 1 the normalized result is written to dst. Otherwise each block writes its
//          unnormalized partial result to dst and its (running max, running sum) pair to dst_meta;
//          presumably a separate pass combines these - confirm against the caller.
//
// NOTE(review): K/V rows and mask entries are read in full tiles of FATTN_KQ_STRIDE_TILE_F32 without a
// per-row bound check against ne11, so the caller presumably pads those buffers - confirm.
// NOTE(review): several shape/stride arguments are unused in this body; the signature presumably has to
// match the common fattn_kernel_t function pointer type.
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_tile_ext_f32(
        const char * __restrict__ Q,
        const char * __restrict__ K,
        const char * __restrict__ V,
        const char * __restrict__ mask,
        float      * __restrict__ dst,
        float2     * __restrict__ dst_meta,
        const float scale,
        const float max_bias,
        const float m0,
        const float m1,
        const uint32_t n_head_log2,
        const float logit_softcap,
        const int ne00,
        const int ne01,
        const int ne02,
        const int ne03,
        const int ne10,
        const int ne11,
        const int ne12,
        const int ne13,
        const int ne31,
        const int nb31,
        const int nb01,
        const int nb02,
        const int nb03,
        const int nb11,
        const int nb12,
        const int nb13,
        const int nb21,
        const int nb22,
        const int nb23,
        const int ne0,
        const int ne1,
        const int ne2,
        const int ne3) {
    // Skip unused kernel variants for faster compilation:
    if (use_logit_softcap && !(D == 128 || D == 256)) {
        NO_DEVICE_CODE;
        return;
    }

    // In this kernel Q, K, V are matrices while i, j, k are matrix indices.

    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
    const int ip  =  blockIdx.x % parallel_blocks;          // Index in group of blocks running for the same column in parallel.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
    // V is addressed with its own strides (nb21/nb22) so that K and V are not required to share a memory layout.
    const half2  * V_h2  = (const half2  *) (V    + nb22*(blockIdx.y / gqa_ratio));
    const half   * maskh = (const half   *)  mask + ne11*ic0;

    const int stride_K2 = nb11 / sizeof(half2); // Row stride of K in units of half2.
    const int stride_V2 = nb21 / sizeof(half2); // Row stride of V in units of half2.

    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");

    __shared__ float KQ[ncols*FATTN_KQ_STRIDE_TILE_F32];

    __shared__ float KV_tmp[FATTN_KQ_STRIDE_TILE_F32][D + 1]; // Pad D to avoid memory bank conflicts.
    // The same buffer is reused for the V tile, viewed as densely packed rows of D/2 float2 values.
    float2 * KV_tmp2 = (float2 *) KV_tmp;

    // Running maximum of the KQ values per Q column, used for the numerically stable softmax:
    float kqmax[ncols/nwarps];
#pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        kqmax[j0/nwarps] = -FLT_MAX/2.0f;
    }
    // Per-thread partial of the running softmax denominator; reduced across the warp at the very end.
    float kqsum[ncols/nwarps] = {0.0f};

    // Unnormalized output accumulator; each thread owns (D/2)/WARP_SIZE float2 values per Q column.
    float2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};

    // Scale Q and store it as float in shared memory:
    __shared__ float Q_f[ncols][D];
#pragma unroll
    for (int j0 = 0; j0 < ncols; j0 += nwarps) {
        const int j = j0 + threadIdx.y;

#pragma unroll
        for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
            // Columns past the end of Q are filled with zeros.
            float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x] : make_float2(0.0f, 0.0f);
            // The two halves of each float2 are stored WARP_SIZE apart. K is stored with the same permutation
            // below, so the dot products over the full range of D are unaffected.
            Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
            Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
        }
    }

    __syncthreads();

    const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F32;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F32) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        float kqmax_new[ncols/nwarps];
#pragma unroll
        for (int j = 0; j < ncols/nwarps; ++j) {
            kqmax_new[j] = kqmax[j];
        }

        // Load the current K tile into shared memory, converting half -> float:
#pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;

#pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) {
                const half2 tmp = K_h2[(k_VKQ_0 + i_KQ)*stride_K2 + k_KQ_0/2 + threadIdx.x];
                KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] =  __low2float(tmp);
                KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp);
            }
        }

        __syncthreads();

        // KQ dot products: threadIdx.x selects the K row, threadIdx.y the Q column.
        float sum[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE][ncols/nwarps] = {{0.0f}};

#pragma unroll
        for (int k_KQ = 0; k_KQ < D; ++k_KQ) {
            float K_k[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE];
            float Q_k[ncols/nwarps];

#pragma unroll
            for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
                const int i_KQ = i_KQ_0 + threadIdx.x;

                K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
            }
#pragma unroll
            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                const int j_KQ = j_KQ_0 + threadIdx.y;

                Q_k[j_KQ_0/nwarps] = Q_f[j_KQ][k_KQ];
            }

#pragma unroll
            for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
#pragma unroll
                for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                    sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE] * Q_k[j_KQ_0/nwarps];
                }
            }
        }

        // Apply the optional softcap and mask, update the per-thread maximum and publish KQ to shared memory:
#pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
            const int i_KQ = i_KQ_0 + threadIdx.x;

#pragma unroll
            for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
                const int j_KQ = j_KQ_0 + threadIdx.y;

                if (use_logit_softcap) {
                    sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
                }

                sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;

                kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);

                KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F32 + i_KQ] = sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps];
            }
        }

        __syncthreads();

        // Softmax update: rescale the previous sum and accumulator by exp(old max - new max),
        // then replace the KQ values in shared memory with exp(KQ - new max).
#pragma unroll
        for (int j0 = 0; j0 < ncols; j0 += nwarps) {
            const int j = j0 + threadIdx.y;

            kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
            const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
            kqmax[j0/nwarps] = kqmax_new[j0/nwarps];

            float kqsum_add = 0.0f;
#pragma unroll
            for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F32; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                const float diff = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] - kqmax[j0/nwarps];
                const float val = expf(diff);
                kqsum_add += val;
                KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] = val;
            }
            kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
                VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
            }
        }

        __syncthreads();

        // Load the current V tile into the (reused) shared buffer, converting half2 -> float2:
#pragma unroll
        for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F32; k0 += nwarps) {
            const int k = k0 + threadIdx.y;

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                KV_tmp2[k*(D/2) + i].x =  __low2float(V_h2[(k_VKQ_0 + k)*stride_V2 + i]);
                KV_tmp2[k*(D/2) + i].y = __high2float(V_h2[(k_VKQ_0 + k)*stride_V2 + i]);
            }
        }

        __syncthreads();

        // Accumulate V * softmax(KQ) for this tile:
#pragma unroll
        for (int k = 0; k < FATTN_KQ_STRIDE_TILE_F32; ++k) {
            float2 V_k[(D/2)/WARP_SIZE];
            float  KQ_k[ncols/nwarps];

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
                const int i = i0 + threadIdx.x;

                V_k[i0/WARP_SIZE] = KV_tmp2[k*(D/2) + i];
            }
#pragma unroll
            for (int j0 = 0; j0 < ncols; j0 += nwarps) {
                const int j = j0 + threadIdx.y;

                KQ_k[j0/nwarps] = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + k];
            }

#pragma unroll
            for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
#pragma unroll
                for (int j0 = 0; j0 < ncols; j0 += nwarps) {
                    VKQ[j0/nwarps][i0/WARP_SIZE].x += V_k[i0/WARP_SIZE].x*KQ_k[j0/nwarps];
                    VKQ[j0/nwarps][i0/WARP_SIZE].y += V_k[i0/WARP_SIZE].y*KQ_k[j0/nwarps];
                }
            }
        }

        // Make sure all reads of KQ and KV_tmp are done before the next iteration overwrites them.
        __syncthreads();
    }

    // Write back the results:
#pragma unroll
    for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
        const int j_VKQ = j_VKQ_0 + threadIdx.y;

        // The condition only depends on threadIdx.y, so the lanes of a warp leave together and the
        // warp reduction below is never reached by only part of a warp. No barrier follows this point.
        if (ic0 + j_VKQ >= ne01) {
            return;
        }

        float kqsum_j = kqsum[j_VKQ_0/nwarps];
        kqsum_j = warp_reduce_sum(kqsum_j);

#pragma unroll
        for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
            const int i0 = i00 + 2*threadIdx.x;

            float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
            if (parallel_blocks == 1) {
                // Only a block that has seen all of K/V can normalize by the softmax denominator.
                dst_val.x /= kqsum_j;
                dst_val.y /= kqsum_j;
            }
            const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = dst_val.x;
            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = dst_val.y;
        }

        if (parallel_blocks != 1 && threadIdx.x == 0) {
            // Store this block's running max and sum next to its partial result.
            dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
        }
    }
}
// Instantiates the FP32 tile kernel for a single compile-time head size D and passes it to launch_fattn.
// The two trailing `true` arguments are forwarded unchanged; presumably they request FP16 K and V data,
// which matches the half2 reads in the kernel - confirm against launch_fattn.
template <int D, int cols_per_block, int parallel_blocks, bool use_logit_softcap>
static void launch_fattn_tile_f32_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    constexpr int nwarps = 8;
    fattn_kernel_t kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
    launch_fattn<D, parallel_blocks>(ctx, dst, kernel, nwarps, cols_per_block, true, true);
}

// Selects the kernel instance that matches the runtime head size of Q (dst->src[0]->ne[0]).
// Only head sizes 64 and 128 are handled; any other value aborts.
template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const auto head_size = dst->src[0]->ne[0];

    if (head_size == 64) {
        launch_fattn_tile_f32_case<64, cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
        return;
    }
    if (head_size == 128) {
        launch_fattn_tile_f32_case<128, cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
        return;
    }

    GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
}
// Turns the runtime logit_softcap value into the compile-time use_logit_softcap flag.
// Softcapping is considered disabled only when logit_softcap compares equal to 0.0f.
template <int cols_per_block, int parallel_blocks>
static void launch_fattn_tile_f32_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst, const float logit_softcap) {
    if (logit_softcap != 0.0f) {
        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, true>(ctx, dst);
    } else {
        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, false>(ctx, dst);
    }
}

// Entry point for the FP32 tile FlashAttention kernels.
// The launch configuration is picked from the number of Q columns (Q->ne[1]):
//   <= 16 columns: 16 columns per block, 4 parallel blocks
//   <= 32 columns: 32 columns per block, 4 parallel blocks
//   otherwise:     32 columns per block, 1 parallel block
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    // logit_softcap is stored as a float at index 2 of the op_params; copy it out with memcpy.
    float logit_softcap;
    memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float));

    const auto n_q_cols = dst->src[0]->ne[1];

    if (n_q_cols <= 16) {
        launch_fattn_tile_f32_softcap<16, 4>(ctx, dst, logit_softcap);
    } else if (n_q_cols <= 32) {
        launch_fattn_tile_f32_softcap<32, 4>(ctx, dst, logit_softcap);
    } else {
        launch_fattn_tile_f32_softcap<32, 1>(ctx, dst, logit_softcap);
    }
}
|