argsort.cu

#include "argsort.cuh"

template<typename T>
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
    T tmp = a;
    a = b;
    b = tmp;
}
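
// Sorts the indices of one row of x by value using a bitonic sorting network:
// one thread block per row, one thread per padded column. The row length is
// padded to the next power of two; padding indices (>= ncols) compare as
// greater than every valid element for ASC and smaller for DESC, so they
// always land in the last ncols_pad - ncols slots and are dropped when the
// result is written back.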
template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
    // bitonic sort
    int col = threadIdx.x;
    int row = blockIdx.y;

    if (col >= ncols_pad) {
        return;
    }

    const float * x_row = x + row * ncols;
    extern __shared__ int dst_row[];

    // initialize indices
    dst_row[col] = col;

    __syncthreads();

    for (int k = 2; k <= ncols_pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            // partner element for this compare-and-swap step
            int ixj = col ^ j;
            if (ixj > col) {
                // (col & k) selects the direction: alternating k-sized blocks are
                // sorted in the requested order and its reverse to form the bitonic
                // sequences merged in later stages
                if ((col & k) == 0) {
                    if (dst_row[col] >= ncols ||
                        (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
                    ) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                } else {
                    if (dst_row[ixj] >= ncols ||
                        (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ?
                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
                    ) {
                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
                    }
                }
            }
            __syncthreads();
        }
    }

    // copy the result to dst without the padding
    if (col < ncols) {
        dst[row * ncols + col] = dst_row[col];
    }
}
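
// smallest power of two >= x, e.g. next_power_of_2(5) == 8, next_power_of_2(8) == 8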
static int next_power_of_2(int x) {
    int n = 1;
    while (n < x) {
        n *= 2;
    }
    return n;
}
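
// One thread block sorts one row: ncols_pad threads per block and ncols_pad ints
// of dynamic shared memory, so ncols_pad must also fit within the device's
// maximum block size.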
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
    // bitonic sort requires ncols to be a power of 2
    const int ncols_pad = next_power_of_2(ncols);

    const dim3 block_dims(ncols_pad, 1, 1);
    const dim3 block_nums(1, nrows, 1);
    const size_t shared_mem = ncols_pad * sizeof(int);

    GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);

    if (order == GGML_SORT_ORDER_ASC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else if (order == GGML_SORT_ORDER_DESC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else {
        GGML_ASSERT(false);
    }
}

void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);

    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

    argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
}
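
// The sketch below is not part of ggml; it is a minimal illustration of how the
// raw kernel could be exercised on a single row, assuming a CUDA context is
// available and the default stream is used. It is disabled so it does not
// affect the build.
#if 0
static void argsort_f32_i32_example() {
    const int ncols     = 5;
    const int ncols_pad = next_power_of_2(ncols); // 8
    const float h_x[5]  = {3.0f, 1.0f, 4.0f, 1.5f, 2.0f};

    float * d_x   = nullptr;
    int   * d_dst = nullptr;
    cudaMalloc(&d_x,   ncols * sizeof(float));
    cudaMalloc(&d_dst, ncols * sizeof(int));
    cudaMemcpy(d_x, h_x, ncols * sizeof(float), cudaMemcpyHostToDevice);

    // one row, ncols_pad threads, ncols_pad ints of dynamic shared memory
    const dim3 block_dims(ncols_pad, 1, 1);
    const dim3 block_nums(1, 1, 1);
    k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, ncols_pad * sizeof(int), 0>>>(d_x, d_dst, ncols, ncols_pad);

    int h_dst[5];
    cudaMemcpy(h_dst, d_dst, ncols * sizeof(int), cudaMemcpyDeviceToHost);
    // h_dst is now {1, 3, 4, 0, 2}: the indices of x in ascending order of value

    cudaFree(d_x);
    cudaFree(d_dst);
}
#endif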