argsort.cu

/**
 * llama.cpp - commit 8962422b1c6f9b8b15f5aeaea42600bcc2d44177 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "argsort.cuh"
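
// This file implements GGML_OP_ARGSORT for the CUDA backend: for each row of a
// contiguous F32 tensor it writes the permutation of column indices that sorts
// the row. Sorting is done entirely in shared memory with a bitonic sorting
// network, so each row is padded to the next power of two.
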
template<typename T>
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
    T tmp = a;
    a = b;
    b = tmp;
}
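
// Bitonic argsort kernel. Launch configuration: one thread block per row
// (gridDim.y == nrows) with blockDim.x == ncols_pad, where ncols_pad is the
// row length rounded up to a power of two. Each thread owns one slot of the
// shared-memory index array dst_row (ncols_pad ints of dynamic shared memory).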
template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
    // bitonic sort
    int col = threadIdx.x;
    int row = blockIdx.y;

    if (col >= ncols_pad) {
        return;
    }

    const float * x_row = x + row * ncols;
    extern __shared__ int dst_row[];

    // initialize indices
    dst_row[col] = col;
    __syncthreads();
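
    // Standard bitonic network: k is the size of the bitonic subsequences
    // being merged and j is the compare-and-exchange distance within a stage.
    // Each thread compares its slot with the partner at col ^ j; pairs with
    // (col & k) == 0 order in the requested direction, the others in the
    // opposite direction, which is what makes the merge bitonic. Padding
    // indices (>= ncols) always compare as if they sort after every real
    // element, so they collect at the tail and are dropped when writing dst.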
    for (int k = 2; k <= ncols_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            int ixj = col ^ j;
            if (ixj > col) {
                if ((col & k) == 0) {
                    if (dst_row[col] >= ncols ||
                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
                    ) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                } else {
                    if (dst_row[ixj] >= ncols ||
                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
                    ) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                }
            }
            __syncthreads();
        }
    }

    // copy the result to dst without the padding
    if (col < ncols) {
        dst[row * ncols + col] = dst_row[col];
    }
}
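
// Smallest power of two that is >= x (returns 1 for x <= 1).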
static int next_power_of_2(int x) {
    int n = 1;
    while (n < x) {
        n *= 2;
    }
    return n;
}
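
// Host-side launcher: pads the row length to the next power of two, reserves
// one int of dynamic shared memory per padded column, and dispatches the
// kernel specialized on the compile-time sort order.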
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
    // bitonic sort requires ncols to be a power of 2
    const int ncols_pad = next_power_of_2(ncols);

    const dim3 block_dims(ncols_pad, 1, 1);
    const dim3 block_nums(1, nrows, 1);
    const size_t shared_mem = ncols_pad * sizeof(int);

    // FIXME: this limit could be raised by ~2-4x on Ampere or newer
    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);

    if (order == GGML_SORT_ORDER_ASC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else if (order == GGML_SORT_ORDER_DESC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else {
        GGML_ABORT("fatal error");
    }
}
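
// Backend entry point for GGML_OP_ARGSORT: checks that src0 is contiguous F32
// and dst is I32, then argsorts each row of src0 along the innermost dimension.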
void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *) src0->data;
    float * dst_d = (float *) dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

    argsort_f32_i32_cuda(src0_d, (int *) dst_d, ncols, nrows, order, stream);
}
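
// Minimal usage sketch (illustrative only, not part of the original file):
// sorts one row of 5 floats in ascending order by launching the kernel
// directly, bypassing the ggml tensor machinery. Kept under #if 0 so it does
// not affect the build; error checking is omitted for brevity.
#if 0
static void example_argsort_row() {
    const float h_x[5] = {3.0f, 1.0f, 4.0f, 1.5f, 9.0f};
    const int ncols = 5;
    const int ncols_pad = next_power_of_2(ncols); // 8

    float * d_x = nullptr;
    int * d_dst = nullptr;
    cudaMalloc(&d_x, ncols * sizeof(float));
    cudaMalloc(&d_dst, ncols * sizeof(int));
    cudaMemcpy(d_x, h_x, ncols * sizeof(float), cudaMemcpyHostToDevice);

    // one block per row, one thread and one int of shared memory per padded column
    k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<dim3(1, 1, 1), dim3(ncols_pad, 1, 1), ncols_pad * sizeof(int), 0>>>(
        d_x, d_dst, ncols, ncols_pad);

    int h_dst[5];
    cudaMemcpy(h_dst, d_dst, ncols * sizeof(int), cudaMemcpyDeviceToHost);
    // h_dst == {1, 3, 0, 2, 4}: h_x[1] <= h_x[3] <= h_x[0] <= h_x[2] <= h_x[4]

    cudaFree(d_x);
    cudaFree(d_dst);
}
#endif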