123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120 |
- /**
- * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
- *
- * MIT License
- *
- * Copyright (c) 2023-2024 The ggml authors
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
- #include "pool2d.cuh"
- template <typename Ti, typename To>
- static __global__ void pool2d_nchw_kernel(
- const int ih, const int iw, const int oh, const int ow,
- const int kh, const int kw, const int sh, const int sw,
- const int ph, const int pw, const int parallel_elements,
- const Ti* src, To* dst, const enum ggml_op_pool op) {
- int idx = threadIdx.x + blockIdx.x * blockDim.x;
- if (idx >= parallel_elements) {
- return;
- }
- const int I_HW = ih * iw;
- const int O_HW = oh * ow;
- const int nc = idx / O_HW;
- const int cur_oh = idx % O_HW / ow;
- const int cur_ow = idx % O_HW % ow;
- const Ti* i_ptr = src + nc * I_HW;
- To* o_ptr = dst + nc * O_HW;
- const int start_h = cur_oh * sh - ph;
- const int bh = max(0, start_h);
- const int eh = min(ih, start_h + kh);
- const int start_w = cur_ow * sw - pw;
- const int bw = max(0, start_w);
- const int ew = min(iw, start_w + kw);
- const To scale = 1. / (kh * kw);
- To res = 0;
- switch (op) {
- case GGML_OP_POOL_AVG: res = 0; break;
- case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
- default: assert(false);
- }
- for (int i = bh; i < eh; i += 1) {
- for (int j = bw; j < ew; j += 1) {
- #if __CUDA_ARCH__ >= 350
- Ti cur = __ldg(i_ptr + i * iw + j);
- #else
- Ti cur = i_ptr[i * iw + j];
- #endif
- switch (op) {
- case GGML_OP_POOL_AVG: res += cur * scale; break;
- case GGML_OP_POOL_MAX: res = max(res, (To)cur); break;
- default: assert(false);
- }
- }
- }
- o_ptr[cur_oh * ow + cur_ow] = res;
- }
- static void pool2d_nchw_kernel_f32_f32_cuda(
- const int ih, const int iw, const int oh, const int ow,
- const int kh, const int kw, const int sh, const int sw,
- const int ph, const int pw, const int parallel_elements,
- const float * src, float * dst, const enum ggml_op_pool op,
- cudaStream_t stream) {
- const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE;
- dim3 block_nums(num_blocks);
- pool2d_nchw_kernel<<<block_nums, CUDA_POOL2D_BLOCK_SIZE, 0, stream>>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op);
- }
- void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
- const ggml_tensor * src0 = dst->src[0];
- const float * src0_d = (const float *)src0->data;
- float * dst_d = (float *)dst->data;
- cudaStream_t stream = ctx.stream();
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
- const int32_t * opts = (const int32_t *)dst->op_params;
- enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
- const int k0 = opts[1];
- const int k1 = opts[2];
- const int s0 = opts[3];
- const int s1 = opts[4];
- const int p0 = opts[5];
- const int p1 = opts[6];
- const int64_t IH = src0->ne[1];
- const int64_t IW = src0->ne[0];
- const int64_t N = dst->ne[3];
- const int64_t OC = dst->ne[2];
- const int64_t OH = dst->ne[1];
- const int64_t OW = dst->ne[0];
- const int parallel_elements = N * OC * OH * OW;
- pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream);
- }
|