8 months ago · b267ab92b0
--- a/llama/ggml-cuda/vendors/cuda.h
+++ b/llama/ggml-cuda/vendors/cuda.h
@@ -0,0 +1,40 @@
 
															+/**
														
 
															+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
														
 
															+ *
														
 
															+ * MIT License
														
 
															+ *
														
 
															+ * Copyright (c) 2023-2024 The ggml authors
														
 
															+ *
														
 
															+ * Permission is hereby granted, free of charge, to any person obtaining a copy
														
 
															+ * of this software and associated documentation files (the "Software"), to deal
														
 
															+ * in the Software without restriction, including without limitation the rights
														
 
															+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
														
 
															+ * copies of the Software, and to permit persons to whom the Software is
														
 
															+ * furnished to do so, subject to the following conditions:
														
 
															+ *
														
 
															+ * The above copyright notice and this permission notice shall be included in all
														
 
															+ * copies or substantial portions of the Software.
														
 
															+ *
														
 
															+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
														
 
															+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
														
 
															+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
														
 
															+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
														
 
															+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
														
 
															+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
														
 
															+ * SOFTWARE.
														
 
															+ */
														
 
															+
														
 
															+#pragma once
														
 
															+
														
 
															+#include <cuda_runtime.h>
														
 
															+#include <cuda.h>
														
 
															+#include <cublas_v2.h>
														
 
															+#include <cuda_fp16.h>
														
 
															+
														
 
															+#if CUDART_VERSION < 11020
														
 
															+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
														
 
															+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
														
 
															+#define CUBLAS_COMPUTE_16F CUDA_R_16F
														
 
															+#define CUBLAS_COMPUTE_32F CUDA_R_32F
														
 
															+#define cublasComputeType_t cudaDataType_t
														
 
															+#endif // CUDART_VERSION < 11020
														
--- a/llama/ggml-cuda/vendors/hip.h
+++ b/llama/ggml-cuda/vendors/hip.h
@@ -0,0 +1,203 @@
 
															+/**
														
 
															+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
														
 
															+ *
														
 
															+ * MIT License
														
 
															+ *
														
 
															+ * Copyright (c) 2023-2024 The ggml authors
														
 
															+ *
														
 
															+ * Permission is hereby granted, free of charge, to any person obtaining a copy
														
 
															+ * of this software and associated documentation files (the "Software"), to deal
														
 
															+ * in the Software without restriction, including without limitation the rights
														
 
															+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
														
 
															+ * copies of the Software, and to permit persons to whom the Software is
														
 
															+ * furnished to do so, subject to the following conditions:
														
 
															+ *
														
 
															+ * The above copyright notice and this permission notice shall be included in all
														
 
															+ * copies or substantial portions of the Software.
														
 
															+ *
														
 
															+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
														
 
															+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
														
 
															+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
														
 
															+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
														
 
															+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
														
 
															+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
														
 
															+ * SOFTWARE.
														
 
															+ */
														
 
															+
														
 
															+#pragma once
														
 
															+
														
 
															+#include <hip/hip_runtime.h>
														
 
															+#include <hipblas/hipblas.h>
														
 
															+#include <hip/hip_fp16.h>
														
 
															+#ifdef __HIP_PLATFORM_AMD__
														
 
															+// for rocblas_initialize()
														
 
															+#include "rocblas/rocblas.h"
														
 
															+#endif // __HIP_PLATFORM_AMD__
														
 
															+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
														
 
															+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
														
 
															+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
														
 
															+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
														
 
															+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
														
 
															+#define CUBLAS_OP_N HIPBLAS_OP_N
														
 
															+#define CUBLAS_OP_T HIPBLAS_OP_T
														
 
															+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
														
 
															+#define CUBLAS_TF32_TENSOR_OP_MATH 0
														
 
															+#define CUDA_R_16F  HIPBLAS_R_16F
														
 
															+#define CUDA_R_32F  HIPBLAS_R_32F
														
 
															+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
														
 
															+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
														
 
															+#define cublasCreate hipblasCreate
														
 
															+#define cublasDestroy hipblasDestroy
														
 
															+#define cublasGemmEx hipblasGemmEx
														
 
															+#define cublasGemmBatchedEx hipblasGemmBatchedEx
														
 
															+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
														
 
															+#define cublasHandle_t hipblasHandle_t
														
 
															+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
														
 
															+#define cublasSetStream hipblasSetStream
														
 
															+#define cublasSgemm hipblasSgemm
														
 
															+#define cublasStatus_t hipblasStatus_t
														
 
															+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
														
 
															+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
														
 
															+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
														
 
															+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
														
 
															+#define cudaDeviceProp hipDeviceProp_t
														
 
															+#define cudaDeviceSynchronize hipDeviceSynchronize
														
 
															+#define cudaError_t hipError_t
														
 
															+#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
														
 
															+#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
														
 
															+#define cudaEventCreateWithFlags hipEventCreateWithFlags
														
 
															+#define cudaEventDisableTiming hipEventDisableTiming
														
 
															+#define cudaEventRecord hipEventRecord
														
 
															+#define cudaEventSynchronize hipEventSynchronize
														
 
															+#define cudaEvent_t hipEvent_t
														
 
															+#define cudaEventDestroy hipEventDestroy
														
 
															+#define cudaFree hipFree
														
 
															+#define cudaFreeHost hipHostFree
														
 
															+#define cudaGetDevice hipGetDevice
														
 
															+#define cudaGetDeviceCount hipGetDeviceCount
														
 
															+#define cudaGetDeviceProperties hipGetDeviceProperties
														
 
															+#define cudaGetErrorString hipGetErrorString
														
 
															+#define cudaGetLastError hipGetLastError
														
 
															+#define cudaHostRegister hipHostRegister
														
 
															+#define cudaHostRegisterPortable hipHostRegisterPortable
														
 
															+#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
														
 
															+#define cudaHostUnregister hipHostUnregister
														
 
															+#define cudaLaunchHostFunc hipLaunchHostFunc
														
 
															+#define cudaMalloc hipMalloc
														
 
															+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
														
 
															+#define cudaMemcpy hipMemcpy
														
 
															+#define cudaMemcpyAsync hipMemcpyAsync
														
 
															+#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
														
 
															+#define cudaMemcpy2DAsync hipMemcpy2DAsync
														
 
															+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
														
 
															+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
														
 
															+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
														
 
															+#define cudaMemcpyKind hipMemcpyKind
														
 
															+#define cudaMemset hipMemset
														
 
															+#define cudaMemsetAsync hipMemsetAsync
														
 
															+#define cudaMemGetInfo hipMemGetInfo
														
 
															+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
														
 
															+#define cudaSetDevice hipSetDevice
														
 
															+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
														
 
															+#define cudaStreamDestroy hipStreamDestroy
														
 
															+#define cudaStreamFireAndForget hipStreamFireAndForget
														
 
															+#define cudaStreamNonBlocking hipStreamNonBlocking
														
 
															+#define cudaStreamPerThread hipStreamPerThread
														
 
															+#define cudaStreamSynchronize hipStreamSynchronize
														
 
															+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
														
 
															+#define cudaStream_t hipStream_t
														
 
															+#define cudaSuccess hipSuccess
														
 
															+#define __trap() do { abort(); __builtin_unreachable(); } while(0)
														
 
															+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
														
 
															+#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
														
 
															+#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
														
 
															+#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
														
 
															+#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
														
 
															+#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
														
 
															+#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
														
 
															+#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
														
 
															+#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
														
 
															+
														
 
															+#define __CUDA_ARCH__ 1300
														
 
															+
														
 
															+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
														
 
															+    defined(__gfx1150__) || defined(__gfx1151__)
														
 
															+#define RDNA3
														
 
															+#endif
														
 
															+
														
 
															+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
														
 
															+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
														
 
															+#define RDNA2
														
 
															+#endif
														
 
															+
														
 
															+#if defined(__gfx1010__) || defined(__gfx1012__)
														
 
															+#define RDNA1
														
 
															+#endif
														
 
															+
														
 
															+#ifndef __has_builtin
														
 
															+    #define __has_builtin(x) 0
														
 
															+#endif
														
 
															+
														
 
															+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
														
 
															+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
														
 
															+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
														
 
															+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
														
 
															+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
														
 
															+#if __has_builtin(__builtin_elementwise_sub_sat)
														
 
															+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
														
 
															+    return reinterpret_cast<const int &>(c);
														
 
															+#else
														
 
															+    int8x4_t c;
														
 
															+    int16_t tmp;
														
 
															+#pragma unroll
														
 
															+    for (int i = 0; i < 4; i++) {
														
 
															+        tmp = va[i] - vb[i];
														
 
															+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
														
 
															+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
														
 
															+        c[i] = tmp;
														
 
															+    }
														
 
															+    return reinterpret_cast<int &>(c);
														
 
															+#endif // __has_builtin(__builtin_elementwise_sub_sat)
														
 
															+}
														
 
															+
														
 
															+static __device__ __forceinline__ int __vsub4(const int a, const int b) {
														
 
															+    return __vsubss4(a, b);
														
 
															+}
														
 
															+
														
 
															+static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
														
 
															+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
														
 
															+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
														
 
															+    unsigned int c;
														
 
															+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
														
 
															+#pragma unroll
														
 
															+    for (int i = 0; i < 4; ++i) {
														
 
															+        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
														
 
															+    }
														
 
															+    return c;
														
 
															+}
														
 
															+
														
 
															+static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
														
 
															+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
														
 
															+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
														
 
															+    unsigned int c;
														
 
															+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
														
 
															+#pragma unroll
														
 
															+    for (int i = 0; i < 4; ++i) {
														
 
															+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
														
 
															+    }
														
 
															+    return c;
														
 
															+}
														
 
															+
														
 
															+#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
														
 
															+// __shfl_xor() for half2 was added in ROCm 5.6
														
 
															+static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
														
 
															+    typedef union half2_b32 {
														
 
															+        half2 val;
														
 
															+        int   b32;
														
 
															+    } half2_b32_t;
														
 
															+    half2_b32_t tmp;
														
 
															+    tmp.val = var;
														
 
															+    tmp.b32 = __shfl_xor(tmp.b32, laneMask, width);
														
 
															+    return tmp.val;
														
 
															+}
														
 
															+#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
														
--- a/llama/ggml-cuda/vendors/musa.h
+++ b/llama/ggml-cuda/vendors/musa.h
@@ -0,0 +1,197 @@
 
															+/**
														
 
															+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
														
 
															+ *
														
 
															+ * MIT License
														
 
															+ *
														
 
															+ * Copyright (c) 2023-2024 The ggml authors
														
 
															+ *
														
 
															+ * Permission is hereby granted, free of charge, to any person obtaining a copy
														
 
															+ * of this software and associated documentation files (the "Software"), to deal
														
 
															+ * in the Software without restriction, including without limitation the rights
														
 
															+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
														
 
															+ * copies of the Software, and to permit persons to whom the Software is
														
 
															+ * furnished to do so, subject to the following conditions:
														
 
															+ *
														
 
															+ * The above copyright notice and this permission notice shall be included in all
														
 
															+ * copies or substantial portions of the Software.
														
 
															+ *
														
 
															+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
														
 
															+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
														
 
															+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
														
 
															+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
														
 
															+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
														
 
															+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
														
 
															+ * SOFTWARE.
														
 
															+ */
														
 
															+
														
 
															+#pragma once
														
 
															+
														
 
															+#include <musa_runtime.h>
														
 
															+#include <musa.h>
														
 
															+#include <mublas.h>
														
 
															+#include <musa_fp16.h>
														
 
															+#define CUBLAS_COMPUTE_16F CUDA_R_16F
														
 
															+#define CUBLAS_COMPUTE_32F CUDA_R_32F
														
 
															+#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
														
 
															+#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
														
 
															+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
														
 
															+#define CUBLAS_OP_N MUBLAS_OP_N
														
 
															+#define CUBLAS_OP_T MUBLAS_OP_T
														
 
															+#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
														
 
															+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
														
 
															+#define CUDA_R_16F  MUSA_R_16F
														
 
															+#define CUDA_R_32F  MUSA_R_32F
														
 
															+#define cublasComputeType_t cudaDataType_t
														
 
															+#define cublasCreate mublasCreate
														
 
															+#define cublasDestroy mublasDestroy
														
 
															+#define cublasGemmEx mublasGemmEx
														
 
															+#define cublasGemmBatchedEx mublasGemmBatchedEx
														
 
															+#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
														
 
															+#define cublasHandle_t mublasHandle_t
														
 
															+#define cublasSetMathMode mublasSetMathMode
														
 
															+#define cublasSetStream mublasSetStream
														
 
															+#define cublasSgemm mublasSgemm
														
 
															+#define cublasStatus_t mublasStatus_t
														
 
															+#define cublasGetStatusString mublasStatus_to_string
														
 
															+#define cudaDataType_t musaDataType_t
														
 
															+#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
														
 
															+#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
														
 
															+#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
														
 
															+#define cudaDeviceProp musaDeviceProp
														
 
															+#define cudaDeviceSynchronize musaDeviceSynchronize
														
 
															+#define cudaError_t musaError_t
														
 
															+#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
														
 
															+#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
														
 
															+#define cudaEventCreateWithFlags musaEventCreateWithFlags
														
 
															+#define cudaEventDisableTiming musaEventDisableTiming
														
 
															+#define cudaEventRecord musaEventRecord
														
 
															+#define cudaEventSynchronize musaEventSynchronize
														
 
															+#define cudaEvent_t musaEvent_t
														
 
															+#define cudaEventDestroy musaEventDestroy
														
 
															+#define cudaFree musaFree
														
 
															+#define cudaFreeHost musaFreeHost
														
 
															+#define cudaGetDevice musaGetDevice
														
 
															+#define cudaGetDeviceCount musaGetDeviceCount
														
 
															+#define cudaGetDeviceProperties musaGetDeviceProperties
														
 
															+#define cudaGetErrorString musaGetErrorString
														
 
															+#define cudaGetLastError musaGetLastError
														
 
															+#define cudaHostRegister musaHostRegister
														
 
															+#define cudaHostRegisterPortable musaHostRegisterPortable
														
 
															+#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
														
 
															+#define cudaHostUnregister musaHostUnregister
														
 
															+#define cudaLaunchHostFunc musaLaunchHostFunc
														
 
															+#define cudaMalloc musaMalloc
														
 
															+#define cudaMallocHost musaMallocHost
														
 
															+#define cudaMemcpy musaMemcpy
														
 
															+#define cudaMemcpyAsync musaMemcpyAsync
														
 
															+#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
														
 
															+#define cudaMemcpy2DAsync musaMemcpy2DAsync
														
 
															+#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
														
 
															+#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
														
 
															+#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
														
 
															+#define cudaMemcpyKind musaMemcpyKind
														
 
															+#define cudaMemset musaMemset
														
 
															+#define cudaMemsetAsync musaMemsetAsync
														
 
															+#define cudaMemGetInfo musaMemGetInfo
														
 
															+#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
														
 
															+#define cudaSetDevice musaSetDevice
														
 
															+#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
														
 
															+#define cudaStreamDestroy musaStreamDestroy
														
 
															+#define cudaStreamFireAndForget musaStreamFireAndForget
														
 
															+#define cudaStreamNonBlocking musaStreamNonBlocking
														
 
															+#define cudaStreamPerThread musaStreamPerThread
														
 
															+#define cudaStreamSynchronize musaStreamSynchronize
														
 
															+#define cudaStreamWaitEvent musaStreamWaitEvent
														
 
															+#define cudaStream_t musaStream_t
														
 
															+#define cudaSuccess musaSuccess
														
 
															+
														
 
															+// Additional mappings for MUSA virtual memory pool
														
 
															+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
														
 
															+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
														
 
															+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
														
 
															+#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
														
 
															+#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
														
 
															+#define CUdevice MUdevice
														
 
															+#define CUdeviceptr MUdeviceptr
														
 
															+#define CUmemAccessDesc MUmemAccessDesc
														
 
															+#define CUmemAllocationProp MUmemAllocationProp
														
 
															+#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
														
 
															+#define cuDeviceGet muDeviceGet
														
 
															+#define cuDeviceGetAttribute muDeviceGetAttribute
														
 
															+#define cuMemAddressFree muMemAddressFree
														
 
															+#define cuMemAddressReserve muMemAddressReserve
														
 
															+#define cuMemCreate muMemCreate
														
 
															+#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
														
 
															+#define cuMemMap muMemMap
														
 
															+#define cuMemRelease muMemRelease
														
 
															+#define cuMemSetAccess muMemSetAccess
														
 
															+#define cuMemUnmap muMemUnmap
														
 
															+#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
														
 
															+#define cudaFuncSetAttribute musaFuncSetAttribute
														
 
															+#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
														
 
															+#define make_cudaExtent make_musaExtent
														
 
															+#define make_cudaPitchedPtr make_musaPitchedPtr
														
 
															+
														
 
															+// Additional mappings for MUSA graphs
														
 
															+#define CUDA_SUCCESS MUSA_SUCCESS
														
 
															+#define CUresult MUresult
														
 
															+#define cuGetErrorString muGetErrorString
														
 
															+#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
														
 
															+#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
														
 
															+#define cudaGraphDestroy musaGraphDestroy
														
 
															+#define cudaGraphExecDestroy musaGraphExecDestroy
														
 
															+#define cudaGraphExec_t musaGraphExec_t
														
 
															+#define cudaGraphExecUpdate musaGraphExecUpdate
														
 
															+#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
														
 
															+#define cudaGraphGetNodes musaGraphGetNodes
														
 
															+#define cudaGraphInstantiate musaGraphInstantiate
														
 
															+#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
														
 
															+#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
														
 
															+#define cudaGraphLaunch musaGraphLaunch
														
 
															+#define cudaGraphNodeGetType musaGraphNodeGetType
														
 
															+#define cudaGraphNode_t musaGraphNode_t
														
 
															+#define cudaGraphNodeType musaGraphNodeType
														
 
															+#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
														
 
															+#define cudaGraph_t musaGraph_t
														
 
															+#define cudaKernelNodeParams musaKernelNodeParams
														
 
															+#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
														
 
															+#define cudaStreamEndCapture musaStreamEndCapture
														
 
															+
														
 
															+// XXX: Clang builtins mapping
														
 
															+#define __vsub4   __vsub4_musa
														
 
															+#define __vcmpeq4 __vcmpeq4_musa
														
 
															+#define __vcmpne4 __vcmpne4_musa
														
 
															+
														
 
															+#ifndef __has_builtin
														
 
															+    #define __has_builtin(x) 0
														
 
															+#endif
														
 
															+
														
 
															+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
														
 
															+
														
 
															+static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
														
 
															+    return __vsubss4(a, b);
														
 
															+}
														
 
															+
														
 
															+static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
														
 
															+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
														
 
															+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
														
 
															+    unsigned int c;
														
 
															+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
														
 
															+#pragma unroll
														
 
															+    for (int i = 0; i < 4; ++i) {
														
 
															+        vc[i] = va[i] == vb[i] ? 0xff : 0x00;
														
 
															+    }
														
 
															+    return c;
														
 
															+}
														
 
															+
														
 
															+static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
														
 
															+    const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
														
 
															+    const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
														
 
															+    unsigned int c;
														
 
															+    uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
														
 
															+#pragma unroll
														
 
															+    for (int i = 0; i < 4; ++i) {
														
 
															+        vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
														
 
															+    }
														
 
															+    return c;
														
 
															+}
														
--- a/llama/sync.sh
+++ b/llama/sync.sh
@@ -68,9 +68,11 @@ cp $src_dir/ggml/src/ggml-aarch64.c $dst_dir/ggml-aarch64.c
 
															 # ggml-cuda
														
 
															 mkdir -p $dst_dir/ggml-cuda/template-instances
														
 
															+mkdir -p $dst_dir/ggml-cuda/vendors
														
 
															 cp $src_dir/ggml/src/ggml-cuda/*.cu $dst_dir/ggml-cuda/
														
 
															 cp $src_dir/ggml/src/ggml-cuda/*.cuh $dst_dir/ggml-cuda/
														
 
															 cp $src_dir/ggml/src/ggml-cuda/template-instances/*.cu $dst_dir/ggml-cuda/template-instances/
														
 
															+cp $src_dir/ggml/src/ggml-cuda/vendors/*.h $dst_dir/ggml-cuda/vendors/
														
 
															 # llava
														
 
															 cp $src_dir/examples/llava/clip.cpp $dst_dir/clip.cpp