im2col.cu 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. /**
  2. * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
  3. *
  4. * MIT License
  5. *
  6. * Copyright (c) 2023-2024 The ggml authors
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy
  9. * of this software and associated documentation files (the "Software"), to deal
  10. * in the Software without restriction, including without limitation the rights
  11. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. * copies of the Software, and to permit persons to whom the Software is
  13. * furnished to do so, subject to the following conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in all
  16. * copies or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  24. * SOFTWARE.
  25. */
  26. #include "im2col.cuh"
  /**
   * im2col gather kernel: copies (or zero-pads) one input sample into one slot
   * of the unfolded output matrix per thread.
   *
   * Launch layout, as read by the code below:
   *   - x dimension: flat index i in [0, pelements); pelements is expected to be
   *     OW * KW * KH and i is decomposed as i = (kx * KH + ky) * OW + ix
   *   - blockIdx.y : output row oh
   *   - blockIdx.z : batch * IC + ic
   *
   * Addressing (all offsets are in elements, not bytes):
   *   - source: x[ic * offset_delta + batch * batch_offset + iih * IW + iiw]
   *   - dest  : dst[((batch * OH + oh) * OW + ix) * CHW + ic * KW * KH + ky * KW + kx]
   *
   * s0/s1 are the strides, p0/p1 the paddings and d0/d1 the dilations along the
   * width/height axes respectively. Samples that fall outside the input are
   * written as zero.
   */
  template <typename T>
  static __global__ void im2col_kernel(
          const float * x, T * dst, int64_t batch_offset,
          int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
          int s0, int s1, int p0, int p1, int d0, int d1) {
      // Widen before multiplying so the global index cannot wrap in 32-bit arithmetic.
      const int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
      if (i >= pelements) {
          return;
      }

      // Split i into (kx, ky, ix). The divisor must be OW * KH so that kx spans
      // the kernel width and ky spans the kernel height. The previous divisor,
      // OW * (KH > 1 ? KW : 1), swapped the two ranges whenever KW != KH, which
      // read the wrong input samples and wrote outside the channel's KW * KH
      // slots. For KH == 1 and for square kernels both divisors are equal.
      const int64_t ksize = OW * KH;
      const int64_t kx = i / ksize;        // kernel column, 0 .. KW-1
      const int64_t kd = kx * ksize;
      const int64_t ky = (i - kd) / OW;    // kernel row,    0 .. KH-1
      const int64_t ix = i % OW;           // output column, 0 .. OW-1

      const int64_t oh    = blockIdx.y;
      const int64_t batch = blockIdx.z / IC;
      const int64_t ic    = blockIdx.z % IC;

      // Input coordinates sampled by this (output position, kernel tap) pair.
      const int64_t iiw = ix * s0 + kx * d0 - p0;
      const int64_t iih = oh * s1 + ky * d1 - p1;

      const int64_t offset_dst =
          ((batch * OH + oh) * OW + ix) * CHW +
          (ic * (KW * KH) + ky * KW + kx);

      if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
          // Tap lies in the padding region.
          dst[offset_dst] = 0.0f;
      } else {
          const int64_t offset_src = ic * offset_delta + batch * batch_offset;
          dst[offset_dst] = x[offset_src + iih * IW + iiw];
      }
  }
  /**
   * Host-side launcher for im2col_kernel on the given stream.
   *
   * Grid layout: x covers the OW * KW * KH work items in blocks of
   * CUDA_IM2COL_BLOCK_SIZE threads, y is OH, z is batch * IC.
   * batch_offset and offset_delta are forwarded unchanged to the kernel, which
   * uses them as element strides into x.
   *
   * The launch is asynchronous; nothing is launched when the problem is empty.
   */
  template <typename T>
  static void im2col_cuda(const float * x, T* dst,
          int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
          int64_t batch, int64_t batch_offset, int64_t offset_delta,
          int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
      // Keep the work-item count in 64 bits: the kernel takes it as int64_t, so
      // squeezing it through an int would only truncate large problems.
      const int64_t parallel_elements = OW * KW * KH;
      const int64_t planes            = batch * IC;

      // A grid with a zero-sized dimension is an invalid launch configuration,
      // so an empty problem is simply a no-op.
      if (parallel_elements <= 0 || OH <= 0 || planes <= 0) {
          return;
      }

      const int64_t num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
      const dim3 block_nums((unsigned int) num_blocks, (unsigned int) OH, (unsigned int) planes);

      im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(
          x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH,
          parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
  }
  // Half-precision output variant: forwards every argument unchanged to
  // im2col_cuda instantiated with T = half.
  static void im2col_cuda_f16(const float * x, half * dst,
          int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
          int64_t batch, int64_t batch_offset, int64_t offset_delta,
          int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
      im2col_cuda<half>(
          x, dst,
          IW, IH, OW, OH, KW, KH, IC,
          batch, batch_offset, offset_delta,
          s0, s1, p0, p1, d0, d1,
          stream);
  }
  // Single-precision output variant: forwards every argument unchanged to
  // im2col_cuda instantiated with T = float.
  static void im2col_cuda_f32(const float * x, float * dst,
          int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
          int64_t batch, int64_t batch_offset, int64_t offset_delta,
          int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
      im2col_cuda<float>(
          x, dst,
          IW, IH, OW, OH, KW, KH, IC,
          batch, batch_offset, offset_delta,
          s0, s1, p0, p1, d0, d1,
          stream);
  }
  // Backend entry point for the im2col op.
  //
  // Reads the convolution kernel tensor from dst->src[0] (only its extents are
  // used) and the F32 input from dst->src[1], unpacks stride/padding/dilation
  // and the 2D flag from dst->op_params, and launches the F16 or F32 im2col
  // variant on the context's stream depending on dst->type.
  void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
      const ggml_tensor * src0 = dst->src[0];
      const ggml_tensor * src1 = dst->src[1];

      const float * input_d = (const float *) src1->data;
      cudaStream_t  stream  = ctx.stream();

      GGML_ASSERT(src0->type == GGML_TYPE_F16);
      GGML_ASSERT(src1->type == GGML_TYPE_F32);
      GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

      // op_params layout: [s0, s1, p0, p1, d0, d1, is_2D]
      const int32_t * params = (const int32_t *) dst->op_params;
      const int32_t s0 = params[0];
      const int32_t s1 = params[1];
      const int32_t p0 = params[2];
      const int32_t p1 = params[3];
      const int32_t d0 = params[4];
      const int32_t d1 = params[5];
      const bool is_2D = params[6] == 1;

      // In the 1D case the height axis collapses to 1 and the channel axis
      // moves from index 2 down to index 1.
      const int channel_axis = is_2D ? 2 : 1;

      const int64_t IW = src1->ne[0];
      const int64_t IH = is_2D ? src1->ne[1] : 1;
      const int64_t IC = src1->ne[channel_axis];

      const int64_t KW = src0->ne[0];
      const int64_t KH = is_2D ? src0->ne[1] : 1;

      const int64_t OW = dst->ne[1];
      const int64_t OH = is_2D ? dst->ne[2] : 1;

      const int64_t batch = src1->ne[3];

      // nb[] holds byte strides; the source is float32, so divide by the
      // element size to obtain element strides.
      const size_t elem_size    = sizeof(float);
      const size_t delta_offset = src1->nb[channel_axis] / elem_size;
      const size_t batch_offset = src1->nb[3] / elem_size;

      if (dst->type != GGML_TYPE_F16) {
          im2col_cuda_f32(input_d, (float *) dst->data, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
      } else {
          im2col_cuda_f16(input_d, (half *) dst->data, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
      }
  }