common.cuh

/**
 * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "ggml.h"
#include "ggml-cuda.h"

#include <cstdint>
#include <memory>

#if defined(GGML_USE_HIP)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
#if defined(GGML_USE_MUSA)
#define GGML_COMMON_DECL_MUSA
#define GGML_COMMON_IMPL_MUSA
#endif
#endif

#include "ggml-common.h"

#include <cstdio>
#include <array>
#include <cassert>
#include <cfloat>
#include <string>
#include <vector>

#if defined(GGML_USE_HIP)
#include "vendors/hip.h"
#elif defined(GGML_USE_MUSA)
#include "vendors/musa.h"
#else
#include "vendors/cuda.h"
#endif // defined(GGML_USE_HIP)

#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#define WARP_SIZE 32
#define CUDART_HMAX  11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

#define GGML_CUDA_CC_PASCAL     600
#define GGML_CUDA_CC_DP4A       610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define GGML_CUDA_CC_VOLTA      700
#define GGML_CUDA_CC_TURING     750
#define GGML_CUDA_CC_AMPERE     800
#define GGML_CUDA_CC_OFFSET_AMD 1000000
// GCN/CDNA, wave size is 64
#define GGML_CUDA_CC_GCN4   (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
#define GGML_CUDA_CC_VEGA   (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA   (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2  (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
#define GGML_CUDA_CC_CDNA3  (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300

// RDNA removes MFMA, dp4a, xnack, acc registers; wave size is 32
#define GGML_CUDA_CC_RDNA1  (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
#define GGML_CUDA_CC_RDNA2  (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
#define GGML_CUDA_CC_RDNA3  (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA

#define GGML_CUDA_CC_QY1 210
#define GGML_CUDA_CC_QY2 220

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

#define CUDA_CHECK_GEN(err, success, error_fn)                                      \
    do {                                                                            \
        auto err_ = (err);                                                          \
        if (err_ != (success)) {                                                    \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
        }                                                                           \
    } while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
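
// Illustrative usage sketch added for this write-up (not in the upstream header): any call
// returning cudaError_t can be wrapped in CUDA_CHECK, which reports file/line context via
// ggml_cuda_error on failure. The helper name and buffer size below are hypothetical.
static inline void ggml_cuda_example_checked_alloc() {
    void * buf = nullptr;
    CUDA_CHECK(cudaMalloc(&buf, 1 << 20)); // aborts with a descriptive error message on failure
    CUDA_CHECK(cudaMemset(buf, 0, 1 << 20));
    CUDA_CHECK(cudaFree(buf));
}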
#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
static const char * cublas_get_error_str(const cublasStatus_t err) {
    return cublasGetStatusString(err);
}
#else
static const char * cublas_get_error_str(const cublasStatus_t err) {
    switch (err) {
        case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
        default: return "unknown error";
    }
}
#endif // CUDART_VERSION >= 12000

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIP)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);
    return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11100

#ifdef GGML_CUDA_F16
typedef half  dfloat; // dequantize float
typedef half2 dfloat2;
#else
typedef float  dfloat; // dequantize float
typedef float2 dfloat2;
#endif // GGML_CUDA_F16

#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#define FP16_AVAILABLE
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL

#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
#define FLASH_ATTN_AVAILABLE
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)

static constexpr bool fast_fp16_available(const int cc) {
    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
}

static constexpr bool fp16_mma_available(const int cc) {
    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
}

static constexpr bool int8_mma_available(const int cc) {
    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
}

[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning
}

#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__

static __device__ __forceinline__ int warp_reduce_sum(int x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
    return __reduce_add_sync(0xffffffff, x);
#else
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    return x;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
}

static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    return x;
}

static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32);
        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32);
    }
    return a;
}

static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32);
        reinterpret_cast<half&>(a.x) += __low2half(a_other);
        reinterpret_cast<half&>(a.y) += __high2half(a_other);
    }
    return a;
#else
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
    }
    return a;
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#else
    NO_DEVICE_CODE;
    return a;
#endif // FP16_AVAILABLE
}

static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
    }
    return x;
}
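
// Illustrative sketch added for this write-up (not in the upstream header): a minimal kernel
// showing how the warp reductions above are meant to be used, assuming it is launched with a
// single block of WARP_SIZE (32) threads. The kernel name is hypothetical.
static __global__ void ggml_cuda_example_warp_sum(const float * x, float * dst, const int n) {
    float sum = 0.0f;
    for (int i = threadIdx.x; i < n; i += WARP_SIZE) {
        sum += x[i]; // each lane accumulates a strided slice of the input
    }
    sum = warp_reduce_sum(sum); // butterfly reduction: afterwards all 32 lanes hold the total
    if (threadIdx.x == 0) {
        *dst = sum;
    }
}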
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE

#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
    return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
    return __hmax(a, b);
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX

#else
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    return a;
#endif // FP16_AVAILABLE
}

static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))

#if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
#endif // CUDART_VERSION >= CUDART_HMAX

#else
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
}

static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
    }
    return x;
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL
}

#if CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
    return mask_low | mask_high;
}
#endif // CUDART_VERSION < CUDART_HMASK

static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;

#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)

#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
    return __dp4a(a, b, c);
#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A

#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
}
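
// Illustrative sketch added for this write-up (not in the upstream header): ggml_cuda_dp4a
// treats each int as four packed signed bytes and accumulates the four byte products into c,
// e.g. a = {1,2,3,4} and b = {5,6,7,8} give c + 1*5 + 2*6 + 3*7 + 4*8 = c + 70. The kernel
// below (name hypothetical) reduces a packed-int8 dot product across each warp.
static __global__ void ggml_cuda_example_dp4a_dot(const int * a, const int * b, int * dst, const int n) {
    int acc = 0;
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        acc = ggml_cuda_dp4a(a[i], b[i], acc); // 4 byte-wise multiply-adds per call
    }
    acc = warp_reduce_sum(acc);
    if (threadIdx.x % WARP_SIZE == 0) {
        atomicAdd(dst, acc); // one atomic per warp
    }
}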
// TODO: move to ggml-common.h
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

static __device__ __forceinline__ float get_alibi_slope(
    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

    return powf(base, exph);
}
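
// Worked example added for clarity (not in the upstream header): for heads with h < n_head_log2
// the slope is m0^(h+1); otherwise it is m1^(2*(h - n_head_log2) + 1). With n_head_log2 = 8,
// head 0 yields m0^1, head 7 yields m0^8, and head 8 yields m1^1.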
template <ggml_type type>
struct ggml_cuda_type_traits;

template<>
struct ggml_cuda_type_traits<GGML_TYPE_F16> {
    static constexpr int qk = 1;
    static constexpr int qr = 1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
    static constexpr int qk = QK4_0;
    static constexpr int qr = QR4_0;
    static constexpr int qi = QI4_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
    static constexpr int qk = QK4_1;
    static constexpr int qr = QR4_1;
    static constexpr int qi = QI4_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
    static constexpr int qk = QK5_0;
    static constexpr int qr = QR5_0;
    static constexpr int qi = QI5_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
    static constexpr int qk = QK5_1;
    static constexpr int qr = QR5_1;
    static constexpr int qi = QI5_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
    static constexpr int qk = QK8_0;
    static constexpr int qr = QR8_0;
    static constexpr int qi = QI8_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_K;
    static constexpr int qi = QI2_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_K;
    static constexpr int qi = QI3_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_K;
    static constexpr int qi = QI4_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR5_K;
    static constexpr int qi = QI5_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR6_K;
    static constexpr int qi = QI6_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XXS;
    static constexpr int qi = QI2_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XS;
    static constexpr int qi = QI2_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_S;
    static constexpr int qi = QI2_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_XXS;
    static constexpr int qi = QI3_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_S;
    static constexpr int qi = QI1_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_M;
    static constexpr int qi = QI1_M;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_XS;
    static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_S;
    static constexpr int qi = QI3_S;
};
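
// Illustrative sketch added for this write-up (not in the upstream header): templated kernels
// read these constants at compile time. qi is the number of 32-bit ints holding the quants of
// one block of qk values, so qk/qi is the number of quantized values packed into each int
// (e.g. 32/4 = 8 for Q4_0). The helper name is hypothetical.
template <ggml_type type>
static __device__ __forceinline__ int ggml_cuda_example_values_per_int() {
    return ggml_cuda_type_traits<type>::qk / ggml_cuda_type_traits<type>::qi;
}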
//////////////////////

struct ggml_cuda_device_info {
    int device_count;

    struct cuda_device_info {
        int     cc;              // compute capability
        int     nsm;             // number of streaming multiprocessors
        size_t  smpb;            // max. shared memory per block
        size_t  smpbo;           // max. shared memory per block (with opt-in)
        bool    vmm;             // virtual memory support
        size_t  vmm_granularity; // granularity of virtual memory
        size_t  total_vram;
    };

    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};

const ggml_cuda_device_info & ggml_cuda_info();

void ggml_cuda_set_device(int device);
int  ggml_cuda_get_device();

struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void   free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    ggml_cuda_pool_alloc() = default;

    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
    }

    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_cuda_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
};
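
// Illustrative usage sketch added for this write-up (not in the upstream header):
// ggml_cuda_pool_alloc is an RAII helper, so the buffer is returned to the pool when the
// object leaves scope. The function name and element count are hypothetical.
static inline void ggml_cuda_example_pool_usage(ggml_cuda_pool & pool) {
    ggml_cuda_pool_alloc<float> tmp(pool, 1024); // requests 1024 floats from the pool
    float * data = tmp.get();                    // device pointer, valid while tmp is alive
    GGML_UNUSED(data);
}                                                // ~ggml_cuda_pool_alloc frees back to the pool here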
// backend interface

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};

#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
#define USE_CUDA_GRAPH
#endif

struct ggml_graph_node_properties {
    void * node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
    size_t nb[GGML_MAX_DIMS];
    void * src_address[GGML_MAX_SRC];
    int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
};

struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
    ~ggml_cuda_graph() {
        if (instance != nullptr) {
            CUDA_CHECK(cudaGraphExecDestroy(instance));
        }
        if (graph != nullptr) {
            CUDA_CHECK(cudaGraphDestroy(graph));
        }
    }
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t instance = nullptr;
    size_t num_nodes = 0;
    std::vector<cudaGraphNode_t> nodes;
    std::vector<cudaKernelNodeParams> params;
    bool disable_due_to_gpu_arch = false;
    bool disable_due_to_too_many_updates = false;
    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
    std::vector<char **> updated_kernel_arg;
#endif
};

struct ggml_backend_cuda_context {
    int device;
    std::string name;
    cudaEvent_t copy_event = nullptr;

    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    std::unique_ptr<ggml_cuda_graph> cuda_graph;

    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_context() {
        if (copy_event != nullptr) {
            CUDA_CHECK(cudaEventDestroy(copy_event));
        }
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
                if (streams[i][j] != nullptr) {
                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
                }
            }
            if (cublas_handles[i] != nullptr) {
                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
            }
        }
    }

    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
        }
        return streams[device][stream];
    }

    cudaStream_t stream() {
        return stream(device, 0);
    }

    cublasHandle_t cublas_handle(int device) {
        if (cublas_handles[device] == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return cublas_handles[device];
    }

    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // pool
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(device);
        }
        return *pools[device];
    }

    ggml_cuda_pool & pool() {
        return pool(device);
    }
};
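
// Illustrative sketch added for this write-up (not in the upstream header): op implementations
// typically pull the current device's lazily created stream and memory pool from the context.
// The function name and sizes are hypothetical.
static inline void ggml_cuda_example_op(ggml_backend_cuda_context & ctx, float * dst, size_t n) {
    cudaStream_t stream = ctx.stream();             // non-blocking stream for the context's device
    ggml_cuda_pool_alloc<float> tmp(ctx.pool(), n); // scratch buffer from that device's pool
    CUDA_CHECK(cudaMemsetAsync(tmp.get(), 0, n*sizeof(float), stream));
    CUDA_CHECK(cudaMemcpyAsync(dst, tmp.get(), n*sizeof(float), cudaMemcpyDeviceToDevice, stream));
}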