common.cuh

/**
 * llama.cpp - commit 6eeaeba126ff701f3e8f79f246805b7023709972 - do not edit this file
 *
 * MIT License
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "ggml.h"
#include "ggml-cuda.h"

#include <cstdint>
#include <memory>

#if defined(GGML_USE_HIPBLAS)
#define GGML_COMMON_DECL_HIP
#define GGML_COMMON_IMPL_HIP
#else
#define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA
#if defined(GGML_USE_MUSA)
#define GGML_COMMON_DECL_MUSA
#define GGML_COMMON_IMPL_MUSA
#endif
#endif
#include "ggml-common.h"

#include <cstdio>
#include <array>
#include <cassert>
#include <cfloat>
#include <string>
#include <vector>

#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif // __HIP_PLATFORM_AMD__
#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
#define cublasDestroy hipblasDestroy
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEventSynchronize hipEventSynchronize
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define __trap() do { abort(); __builtin_unreachable(); } while(0)
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#elif defined(GGML_USE_MUSA)
#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_fp16.h>
// XXX: Keep the following order the same as hipBLAS
// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
// #define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F MUSA_R_16F
#define CUDA_R_32F MUSA_R_32F
// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
// #define cublasComputeType_t mublasComputeType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess
// XXX: Other CUDA => MUSA mapping
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr
// XXX: USE_CUDA_GRAPH
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture
// XXX: cuBLAS => muBLAS mapping
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
// XXX: Clang builtins mapping
#define __vsub4 __vsub4_musa
#define __vcmpeq4 __vcmpeq4_musa
#define __vcmpne4 __vcmpne4_musa
#else
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>

#if CUDART_VERSION < 11020
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
#endif // CUDART_VERSION < 11020
#endif // defined(GGML_USE_HIPBLAS)

#define STRINGIZE_IMPL(...) #__VA_ARGS__
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)

#define WARP_SIZE 32
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

#define CC_PASCAL 600
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA 700
#define CC_TURING 750
#define CC_AMPERE 800
#define CC_OFFSET_AMD 1000000
#define CC_RDNA1 (CC_OFFSET_AMD + 1010)
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)

#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define GGML_CUDA_MAX_STREAMS 8

[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);

#define CUDA_CHECK_GEN(err, success, error_fn) \
    do { \
        auto err_ = (err); \
        if (err_ != (success)) { \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
        } \
    } while (0)

#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
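// Illustrative usage (not part of the upstream header): the *_CHECK macros wrap any runtime
// call whose return code signals success/failure and route failures to ggml_cuda_error(), e.g.
//     CUDA_CHECK(cudaMalloc(&ptr, nbytes));
//     CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, stream));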

#if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
static const char * cublas_get_error_str(const cublasStatus_t err) {
#ifndef GGML_USE_MUSA
    return cublasGetStatusString(err);
#else
    return mublasStatus_to_string(err);
#endif // GGML_USE_MUSA
}
#else
static const char * cublas_get_error_str(const cublasStatus_t err) {
    switch (err) {
        case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
        case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
        case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
        case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
        case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
        case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
        case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
        case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
        case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
        default: return "unknown error";
    }
}
#endif // CUDART_VERSION >= 12000

#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)

#if !defined(GGML_USE_HIPBLAS)
static const char * cu_get_error_str(CUresult err) {
    const char * err_str;
    cuGetErrorString(err, &err_str);
    return err_str;
}
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif

#if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif // CUDART_VERSION >= 11100

#ifdef GGML_CUDA_F16
typedef half dfloat; // dequantize float
typedef half2 dfloat2;
#else
typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif // GGML_CUDA_F16

#if defined(GGML_USE_MUSA)
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));

static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
    return __vsubss4(a, b);
}

static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
}

static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
    }
    return c;
}
#endif // defined(GGML_USE_MUSA)

#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300

#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
    defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif

#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif

#if defined(__gfx1010__) || defined(__gfx1012__)
#define RDNA1
#endif

#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));

static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
#if __has_builtin(__builtin_elementwise_sub_sat)
    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
    return reinterpret_cast<const int &>(c);
#else
    int8x4_t c;
    int16_t tmp;
#pragma unroll
    for (int i = 0; i < 4; i++) {
        tmp = va[i] - vb[i];
        if (tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
        if (tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
        c[i] = tmp;
    }
    return reinterpret_cast<int &>(c);
#endif // __has_builtin(__builtin_elementwise_sub_sat)
}

static __device__ __forceinline__ int __vsub4(const int a, const int b) {
    return __vsubss4(a, b);
}

static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
    }
    return c;
}

static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
    unsigned int c;
    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
    for (int i = 0; i < 4; ++i) {
        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
    }
    return c;
}

#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
// __shfl_xor() for half2 was added in ROCm 5.6
static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
    typedef union half2_b32 {
        half2 val;
        int b32;
    } half2_b32_t;
    half2_b32_t tmp;
    tmp.val = var;
    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
    return tmp.val;
}
#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
#endif // defined(GGML_USE_HIPBLAS)

#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL

#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
#define FAST_FP16_AVAILABLE
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610

#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
#define FP16_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA

#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING

static constexpr bool fast_fp16_available(const int cc) {
    return cc >= CC_PASCAL && cc != 610;
}

static constexpr bool fp16_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}

static constexpr bool int8_mma_available(const int cc) {
    return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}

[[noreturn]]
static __device__ void no_device_code(
    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
           file_name, line, function_name, arch);
    GGML_UNUSED(arch_list);
#else
    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
           file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning
}

#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
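
// Illustrative note: the warp_reduce_* helpers below use a butterfly pattern. Each lane
// XOR-shuffles with lane masks 16, 8, 4, 2, 1, so after log2(WARP_SIZE) = 5 steps every lane
// of the warp holds the combined sum/max. Typical use inside a kernel:
//     float partial = ...;                // per-thread partial result
//     partial = warp_reduce_sum(partial); // now identical across all 32 lanes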
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    return x;
}

static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
    }
    return a;
}

static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#ifdef FP16_AVAILABLE
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
        reinterpret_cast<half&>(a.x) += __low2half(a_other);
        reinterpret_cast<half&>(a.y) += __high2half(a_other);
    }
    return a;
#else
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
    }
    return a;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#else
    NO_DEVICE_CODE;
    return a;
#endif // FP16_AVAILABLE
}

static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
}

static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
    return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
    return __hmax(a, b);
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
#else
    NO_DEVICE_CODE;
    GGML_UNUSED(b);
    return a;
#endif // FP16_AVAILABLE
}

static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
#if CUDART_VERSION >= CUDART_HMAX
    return __hmax2(a, b);
#else
    half2 ret;
    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
    return ret;
#endif // CUDART_VERSION >= CUDART_HMAX
#else
    GGML_UNUSED(a);
    GGML_UNUSED(b);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}

static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
    }
    return x;
#else
    GGML_UNUSED(x);
    NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}

#if CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
    return mask_low | mask_high;
}
#endif // CUDART_VERSION < CUDART_HMASK

static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(RDNA3)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
    int tmp1;
    int tmp2;
    asm("\n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
        v_add3_u32 %0, %1, %2, %0 \n \
        "
        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
        : "v"(a), "v"(b)
    );
#else
    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
    return c;
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if __CUDA_ARCH__ >= MIN_CC_DP4A
    return __dp4a(a, b, c);
#else // __CUDA_ARCH__ >= MIN_CC_DP4A
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
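
// Illustrative note: ggml_cuda_dp4a(a, b, c) returns c plus the dot product of the four signed
// bytes packed into a and b, falling back to scalar code where no dp4a-style instruction exists.
// For example, ggml_cuda_dp4a(0x01010101, 0x02020202, 0) == 8 (four products of 1*2).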

// TODO: move to ggml-common.h
static constexpr __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v);

static __device__ __forceinline__ float get_alibi_slope(
    const float max_bias, const uint32_t h, const uint32_t n_head_log2, const float m0, const float m1
) {
    if (max_bias <= 0.0f) {
        return 1.0f;
    }
    const float base = h < n_head_log2 ? m0 : m1;
    const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

    return powf(base, exph);
}
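
// Illustrative note: get_alibi_slope gives heads with index h < n_head_log2 a slope of
// m0^(h+1) and the remaining heads a slope of m1^(2*(h - n_head_log2) + 1). For example,
// with n_head_log2 = 4, m0 = 0.5f, m1 = 0.7071f:
//     h = 2 -> 0.5^3 = 0.125;   h = 5 -> 0.7071^3 ~= 0.354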

template <ggml_type type>
struct ggml_cuda_type_traits;

template<>
struct ggml_cuda_type_traits<GGML_TYPE_F16> {
    static constexpr int qk = 1;
    static constexpr int qr = 1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_0> {
    static constexpr int qk = QK4_0;
    static constexpr int qr = QR4_0;
    static constexpr int qi = QI4_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_1> {
    static constexpr int qk = QK4_1;
    static constexpr int qr = QR4_1;
    static constexpr int qi = QI4_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_0> {
    static constexpr int qk = QK5_0;
    static constexpr int qr = QR5_0;
    static constexpr int qi = QI5_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
    static constexpr int qk = QK5_1;
    static constexpr int qr = QR5_1;
    static constexpr int qi = QI5_1;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
    static constexpr int qk = QK8_0;
    static constexpr int qr = QR8_0;
    static constexpr int qi = QI8_0;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q2_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_K;
    static constexpr int qi = QI2_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q3_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_K;
    static constexpr int qi = QI3_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q4_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_K;
    static constexpr int qi = QI4_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q5_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR5_K;
    static constexpr int qi = QI5_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_Q6_K> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR6_K;
    static constexpr int qi = QI6_K;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XXS;
    static constexpr int qi = QI2_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_XS;
    static constexpr int qi = QI2_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ2_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR2_S;
    static constexpr int qi = QI2_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_XXS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_XXS;
    static constexpr int qi = QI3_XXS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_S;
    static constexpr int qi = QI1_S;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ1_M> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR1_M;
    static constexpr int qi = QI1_M;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_NL> {
    static constexpr int qk = QK4_NL;
    static constexpr int qr = QR4_NL;
    static constexpr int qi = QI4_NL;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ4_XS> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR4_XS;
    static constexpr int qi = QI4_XS;
};

template<>
struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
    static constexpr int qk = QK_K;
    static constexpr int qr = QR3_S;
    static constexpr int qi = QI3_S;
};
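
// Illustrative use of the traits above: kernels templated on ggml_type read the per-type
// block-layout constants at compile time, e.g.
//     constexpr int qk = ggml_cuda_type_traits<GGML_TYPE_Q4_0>::qk; // values per block (QK4_0)
//     constexpr int qi = ggml_cuda_type_traits<GGML_TYPE_Q4_0>::qi; // 32-bit ints of quant data per block (QI4_0)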

//////////////////////

struct ggml_cuda_device_info {
    int device_count;

    struct cuda_device_info {
        int cc;                  // compute capability
        int nsm;                 // number of streaming multiprocessors
        size_t smpb;             // max. shared memory per block
        size_t smpbo;            // max. shared memory per block (with opt-in)
        bool vmm;                // virtual memory support
        size_t vmm_granularity;  // granularity of virtual memory
        size_t total_vram;
    };

    cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {};

    std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {};
};

const ggml_cuda_device_info & ggml_cuda_info();

void ggml_cuda_set_device(int device);
int ggml_cuda_get_device();

struct ggml_cuda_pool {
    virtual ~ggml_cuda_pool() = default;

    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};

template<typename T>
struct ggml_cuda_pool_alloc {
    ggml_cuda_pool * pool = nullptr;
    T * ptr = nullptr;
    size_t actual_size = 0;

    ggml_cuda_pool_alloc() = default;

    explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) {
    }

    ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) {
        alloc(size);
    }

    ~ggml_cuda_pool_alloc() {
        if (ptr != nullptr) {
            pool->free(ptr, actual_size);
        }
    }

    // size is in number of elements
    T * alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
        return ptr;
    }

    T * alloc(ggml_cuda_pool & pool, size_t size) {
        this->pool = &pool;
        return alloc(size);
    }

    T * get() {
        return ptr;
    }

    ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete;
    ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete;
    ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete;
};
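
// Illustrative use: ggml_cuda_pool_alloc is an RAII helper around a ggml_cuda_pool; the buffer
// is returned to the pool when the object goes out of scope. Assuming `ctx` is a
// ggml_backend_cuda_context (declared below) and `n_elements` is hypothetical:
//     ggml_cuda_pool_alloc<float> tmp(ctx.pool(), n_elements);
//     float * buf = tmp.get(); // device pointer valid for the lifetime of `tmp`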

// backend interface

struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};

#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
#define USE_CUDA_GRAPH
#endif

struct ggml_graph_node_properties {
    void * node_address;
    ggml_op node_op;
    int64_t ne[GGML_MAX_DIMS];
    size_t nb[GGML_MAX_DIMS];
    void * src_address[GGML_MAX_SRC];
};

struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
    ~ggml_cuda_graph() {
        if (instance != nullptr) {
            CUDA_CHECK(cudaGraphExecDestroy(instance));
        }
        if (graph != nullptr) {
            CUDA_CHECK(cudaGraphDestroy(graph));
        }
    }
    cudaGraph_t graph = nullptr;
    cudaGraphExec_t instance = nullptr;
    size_t num_nodes = 0;
    std::vector<cudaGraphNode_t> nodes;
    std::vector<cudaKernelNodeParams> params;
    bool disable_due_to_gpu_arch = false;
    bool disable_due_to_too_many_updates = false;
    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
    std::vector<char **> updated_kernel_arg;
#endif
};

struct ggml_backend_cuda_context {
    int device;
    std::string name;
    cudaEvent_t copy_event = nullptr;

    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

    std::unique_ptr<ggml_cuda_graph> cuda_graph;

    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
    }

    ~ggml_backend_cuda_context() {
        if (copy_event != nullptr) {
            CUDA_CHECK(cudaEventDestroy(copy_event));
        }
        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
            for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
                if (streams[i][j] != nullptr) {
                    CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
                }
            }
            if (cublas_handles[i] != nullptr) {
                CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
            }
        }
    }

    cudaStream_t stream(int device, int stream) {
        if (streams[device][stream] == nullptr) {
            ggml_cuda_set_device(device);
            CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
        }
        return streams[device][stream];
    }

    cudaStream_t stream() {
        return stream(device, 0);
    }

    cublasHandle_t cublas_handle(int device) {
        if (cublas_handles[device] == nullptr) {
            ggml_cuda_set_device(device);
            CUBLAS_CHECK(cublasCreate(&cublas_handles[device]));
            CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH));
        }
        return cublas_handles[device];
    }

    cublasHandle_t cublas_handle() {
        return cublas_handle(device);
    }

    // pool
    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];

    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);

    ggml_cuda_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(device);
        }
        return *pools[device];
    }

    ggml_cuda_pool & pool() {
        return pool(device);
    }
};
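
// Illustrative sketch of how backend code typically uses the context above (the names `ctx`,
// `s`, `h`, `p` are hypothetical):
//     ggml_backend_cuda_context ctx(/*device=*/0);
//     cudaStream_t   s = ctx.stream();        // lazily created non-blocking stream for device 0
//     cublasHandle_t h = ctx.cublas_handle(); // lazily created cuBLAS handle bound to that device
//     ggml_cuda_pool & p = ctx.pool();        // per-device memory pool used for scratch buffers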