há 8 meses atrás · b267ab92b0
--- a/llama/ggml-cuda/vendors/cuda.h
+++ b/llama/ggml-cuda/vendors/cuda.h
@@ -0,0 +1,40 @@
 
				+/**
			
 
				+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
			
 
				+ *
			
 
				+ * MIT License
			
 
				+ *
			
 
				+ * Copyright (c) 2023-2024 The ggml authors
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this software and associated documentation files (the "Software"), to deal
			
 
				+ * in the Software without restriction, including without limitation the rights
			
 
				+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				+ * copies of the Software, and to permit persons to whom the Software is
			
 
				+ * furnished to do so, subject to the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in all
			
 
				+ * copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				+ * SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#pragma once
			
 
				+
			
 
				+#include <cuda_runtime.h>
			
 
				+#include <cuda.h>
			
 
				+#include <cublas_v2.h>
			
 
				+#include <cuda_fp16.h>
			
 
				+
			
 
				+#if CUDART_VERSION < 11020 // older CUDA runtime: define the newer names used by the code as aliases of other identifiers
			
 
				+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
			
 
				+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
			
 
				+#define CUBLAS_COMPUTE_16F CUDA_R_16F
			
 
				+#define CUBLAS_COMPUTE_32F CUDA_R_32F
			
 
				+#define cublasComputeType_t cudaDataType_t // compute-type name is backed by the data-type enum here, matching the two CUDA_R_* aliases above
			
 
				+#endif // CUDART_VERSION < 11020
			
--- a/llama/ggml-cuda/vendors/hip.h
+++ b/llama/ggml-cuda/vendors/hip.h
@@ -0,0 +1,203 @@
 
				+/**
			
 
				+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
			
 
				+ *
			
 
				+ * MIT License
			
 
				+ *
			
 
				+ * Copyright (c) 2023-2024 The ggml authors
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this software and associated documentation files (the "Software"), to deal
			
 
				+ * in the Software without restriction, including without limitation the rights
			
 
				+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				+ * copies of the Software, and to permit persons to whom the Software is
			
 
				+ * furnished to do so, subject to the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in all
			
 
				+ * copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				+ * SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#pragma once
			
 
				+
			
 
				+#include <hip/hip_runtime.h>
			
 
				+#include <hipblas/hipblas.h>
			
 
				+#include <hip/hip_fp16.h>
			
 
				+#ifdef __HIP_PLATFORM_AMD__
			
 
				+// for rocblas_initialize()
			
 
				+#include "rocblas/rocblas.h"
			
 
				+#endif // __HIP_PLATFORM_AMD__
			
 
				+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F // start of the cuBLAS -> hipBLAS name table
			
 
				+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
			
 
				+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F // same expansion as CUBLAS_COMPUTE_32F: no separate fast-16F value is used
			
 
				+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
			
 
				+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT // same expansion as CUBLAS_GEMM_DEFAULT
			
 
				+#define CUBLAS_OP_N HIPBLAS_OP_N
			
 
				+#define CUBLAS_OP_T HIPBLAS_OP_T
			
 
				+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
			
 
				+#define CUBLAS_TF32_TENSOR_OP_MATH 0 // placeholder: the cublasSetMathMode macro below discards its arguments
			
 
				+#define CUDA_R_16F  HIPBLAS_R_16F
			
 
				+#define CUDA_R_32F  HIPBLAS_R_32F
			
 
				+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) // the mask argument is dropped
			
 
				+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
			
 
				+#define cublasCreate hipblasCreate
			
 
				+#define cublasDestroy hipblasDestroy
			
 
				+#define cublasGemmEx hipblasGemmEx
			
 
				+#define cublasGemmBatchedEx hipblasGemmBatchedEx
			
 
				+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
			
 
				+#define cublasHandle_t hipblasHandle_t
			
 
				+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS // no-op: expands to a success status; handle and mode are never evaluated
			
 
				+#define cublasSetStream hipblasSetStream
			
 
				+#define cublasSgemm hipblasSgemm
			
 
				+#define cublasStatus_t hipblasStatus_t
			
 
				+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
			
 
				+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer // start of the CUDA runtime -> HIP runtime name table
			
 
				+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
			
 
				+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
			
 
				+#define cudaDeviceProp hipDeviceProp_t
			
 
				+#define cudaDeviceSynchronize hipDeviceSynchronize
			
 
				+#define cudaError_t hipError_t
			
 
				+#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled
			
 
				+#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled
			
 
				+#define cudaEventCreateWithFlags hipEventCreateWithFlags
			
 
				+#define cudaEventDisableTiming hipEventDisableTiming
			
 
				+#define cudaEventRecord hipEventRecord
			
 
				+#define cudaEventSynchronize hipEventSynchronize
			
 
				+#define cudaEvent_t hipEvent_t
			
 
				+#define cudaEventDestroy hipEventDestroy
			
 
				+#define cudaFree hipFree
			
 
				+#define cudaFreeHost hipHostFree
			
 
				+#define cudaGetDevice hipGetDevice
			
 
				+#define cudaGetDeviceCount hipGetDeviceCount
			
 
				+#define cudaGetDeviceProperties hipGetDeviceProperties
			
 
				+#define cudaGetErrorString hipGetErrorString
			
 
				+#define cudaGetLastError hipGetLastError
			
 
				+#define cudaHostRegister hipHostRegister
			
 
				+#define cudaHostRegisterPortable hipHostRegisterPortable
			
 
				+#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
			
 
				+#define cudaHostUnregister hipHostUnregister
			
 
				+#define cudaLaunchHostFunc hipLaunchHostFunc
			
 
				+#define cudaMalloc hipMalloc
			
 
				+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) // function-like: appends the default flags as a third argument
			
 
				+#define cudaMemcpy hipMemcpy
			
 
				+#define cudaMemcpyAsync hipMemcpyAsync
			
 
				+#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
			
 
				+#define cudaMemcpy2DAsync hipMemcpy2DAsync
			
 
				+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
			
 
				+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
			
 
				+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
			
 
				+#define cudaMemcpyKind hipMemcpyKind
			
 
				+#define cudaMemset hipMemset
			
 
				+#define cudaMemsetAsync hipMemsetAsync
			
 
				+#define cudaMemGetInfo hipMemGetInfo
			
 
				+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
			
 
				+#define cudaSetDevice hipSetDevice
			
 
				+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
			
 
				+#define cudaStreamDestroy hipStreamDestroy
			
 
				+#define cudaStreamFireAndForget hipStreamFireAndForget
			
 
				+#define cudaStreamNonBlocking hipStreamNonBlocking
			
 
				+#define cudaStreamPerThread hipStreamPerThread
			
 
				+#define cudaStreamSynchronize hipStreamSynchronize
			
 
				+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) // function-like: callers must pass all three arguments
			
 
				+#define cudaStream_t hipStream_t
			
 
				+#define cudaSuccess hipSuccess
			
 
				+#define __trap() do { abort(); __builtin_unreachable(); } while(0) // calls abort() and marks what follows unreachable; do/while(0) keeps it a single statement
			
 
				+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS // cuBLAS status codes -> hipBLAS; this one repeats the identical definition made earlier in this header
			
 
				+#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
			
 
				+#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
			
 
				+#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
			
 
				+#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
			
 
				+#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
			
 
				+#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
			
 
				+#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
			
 
				+#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
			
 
				+
			
 
				+#define __CUDA_ARCH__ 1300 // fixed value seen by every __CUDA_ARCH__ check once this header is included; NOTE(review): presumably chosen to sit above real NVIDIA arch numbers - confirm
			
 
				+
			
 
				+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
			
 
				+    defined(__gfx1150__) || defined(__gfx1151__)
			
 
				+#define RDNA3 // defined when any of the gfx110x / gfx115x target macros tested above is set
			
 
				+#endif
			
 
				+
			
 
				+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
			
 
				+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
			
 
				+#define RDNA2 // defined when any of the gfx1030..gfx1037 target macros tested above is set
			
 
				+#endif
			
 
				+
			
 
				+#if defined(__gfx1010__) || defined(__gfx1012__)
			
 
				+#define RDNA1 // defined when the gfx1010 or gfx1012 target macro is set
			
 
				+#endif
			
 
				+
			
 
				+#ifndef __has_builtin // compilers without __has_builtin: make every builtin query evaluate to 0
			
 
				+    #define __has_builtin(x) 0
			
 
				+#endif
			
 
				+
			
 
				+typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); // vector of four int8_t lanes, used to view one 32-bit int as four signed bytes
			
 
				+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); // vector of four uint8_t lanes, used to view one 32-bit value as four unsigned bytes
			
 
				+// Per-byte signed saturating subtraction: views a and b as four int8 lanes,
				+// subtracts lane by lane with each result clamped to the int8_t range, and
				+// returns the four result bytes repacked into one int.
				+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
				+    const int8x4_t lhs = reinterpret_cast<const int8x4_t&>(a);
				+    const int8x4_t rhs = reinterpret_cast<const int8x4_t&>(b);
				+#if __has_builtin(__builtin_elementwise_sub_sat)
				+    // The compiler offers a saturating element-wise subtract: use it directly.
				+    const int8x4_t diff = __builtin_elementwise_sub_sat(lhs, rhs);
				+    return reinterpret_cast<const int &>(diff);
				+#else
				+    // Fallback: compute each lane difference in 16 bits, then clamp it to int8_t.
				+    int8x4_t diff;
				+#pragma unroll
				+    for (int lane = 0; lane < 4; ++lane) {
				+        int16_t wide = lhs[lane] - rhs[lane];
				+        if (wide > std::numeric_limits<int8_t>::max()) {
				+            wide = std::numeric_limits<int8_t>::max();
				+        }
				+        if (wide < std::numeric_limits<int8_t>::min()) {
				+            wide = std::numeric_limits<int8_t>::min();
				+        }
				+        diff[lane] = wide;
				+    }
				+    return reinterpret_cast<int &>(diff);
				+#endif // __has_builtin(__builtin_elementwise_sub_sat)
				+}
			
 
				+
			
 
				+// Per-byte subtraction with wrap-around: views a and b as four 8-bit lanes and
				+// returns (a_lane - b_lane) modulo 256 for every lane, repacked into one int.
				+// This is the behaviour of the CUDA __vsub4 intrinsic that this function stands
				+// in for. The previous body forwarded to __vsubss4, which saturates to the
				+// signed 8-bit range and therefore returned different bytes whenever a lane
				+// difference fell outside [-128, 127].
				+static __device__ __forceinline__ int __vsub4(const int a, const int b) {
				+    const uint8x4_t va = reinterpret_cast<const uint8x4_t&>(a);
				+    const uint8x4_t vb = reinterpret_cast<const uint8x4_t&>(b);
				+    // Unsigned lanes: the element-wise subtract wraps instead of saturating.
				+    const uint8x4_t vc = va - vb;
				+    return reinterpret_cast<const int &>(vc);
				+}
			
 
				+
			
 
				+// Per-byte equality compare: for each of the four bytes of a and b the matching
				+// result byte is 0xff when the two bytes are equal and 0x00 otherwise.
				+static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) {
				+    const uint8x4_t lhs = reinterpret_cast<const uint8x4_t&>(a);
				+    const uint8x4_t rhs = reinterpret_cast<const uint8x4_t&>(b);
				+    uint8x4_t mask;
				+#pragma unroll
				+    for (int lane = 0; lane < 4; ++lane) {
				+        if (lhs[lane] != rhs[lane]) {
				+            mask[lane] = 0x00;
				+        } else {
				+            mask[lane] = 0xff;
				+        }
				+    }
				+    // All four lanes were written above; hand the bytes back as one 32-bit value.
				+    return reinterpret_cast<const unsigned int&>(mask);
				+}
			
 
				+
			
 
				+// Per-byte inequality compare: for each of the four bytes of a and b the matching
				+// result byte is 0xff when the two bytes differ and 0x00 when they are equal.
				+static __device__ __forceinline__ unsigned int __vcmpne4(unsigned int a, unsigned int b) {
				+    const uint8x4_t lhs = reinterpret_cast<const uint8x4_t&>(a);
				+    const uint8x4_t rhs = reinterpret_cast<const uint8x4_t&>(b);
				+    uint8x4_t mask;
				+#pragma unroll
				+    for (int lane = 0; lane < 4; ++lane) {
				+        if (lhs[lane] != rhs[lane]) {
				+            mask[lane] = 0xff;
				+        } else {
				+            mask[lane] = 0x00;
				+        }
				+    }
				+    // All four lanes were written above; hand the bytes back as one 32-bit value.
				+    return reinterpret_cast<const unsigned int&>(mask);
				+}
			
 
				+
			
 
				+#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
				+// __shfl_xor() for half2 was added in ROCm 5.6
				+// Older versions get this overload instead: the half2 is shuffled as a single
				+// 32-bit integer through the int form of __shfl_xor and converted back.
				+static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int width) {
				+    // Overlay the half2 with an int so the integer shuffle can carry its bits.
				+    union {
				+        half2 val;
				+        int   b32;
				+    } bits;
				+    bits.val = var;
				+    const int shuffled = __shfl_xor(bits.b32, laneMask, width);
				+    bits.b32 = shuffled;
				+    return bits.val;
				+}
				+#endif // defined(__HIP_PLATFORM_AMD__) && HIP_VERSION < 50600000
			
--- a/llama/ggml-cuda/vendors/musa.h
+++ b/llama/ggml-cuda/vendors/musa.h
@@ -0,0 +1,197 @@
 
				+/**
			
 
				+ * llama.cpp - commit 1e6f6554aa11fa10160a5fda689e736c3c34169f - do not edit this file
			
 
				+ *
			
 
				+ * MIT License
			
 
				+ *
			
 
				+ * Copyright (c) 2023-2024 The ggml authors
			
 
				+ *
			
 
				+ * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				+ * of this software and associated documentation files (the "Software"), to deal
			
 
				+ * in the Software without restriction, including without limitation the rights
			
 
				+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				+ * copies of the Software, and to permit persons to whom the Software is
			
 
				+ * furnished to do so, subject to the following conditions:
			
 
				+ *
			
 
				+ * The above copyright notice and this permission notice shall be included in all
			
 
				+ * copies or substantial portions of the Software.
			
 
				+ *
			
 
				+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				+ * SOFTWARE.
			
 
				+ */
			
 
				+
			
 
				+#pragma once
			
 
				+
			
 
				+#include <musa_runtime.h>
			
 
				+#include <musa.h>
			
 
				+#include <mublas.h>
			
 
				+#include <musa_fp16.h>
			
 
				+#define CUBLAS_COMPUTE_16F CUDA_R_16F // start of the cuBLAS -> muBLAS name table; resolves through CUDA_R_16F, defined a few lines below
			
 
				+#define CUBLAS_COMPUTE_32F CUDA_R_32F
			
 
				+#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
			
 
				+#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
			
 
				+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT // same expansion as CUBLAS_GEMM_DEFAULT
			
 
				+#define CUBLAS_OP_N MUBLAS_OP_N
			
 
				+#define CUBLAS_OP_T MUBLAS_OP_T
			
 
				+#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
			
 
				+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
			
 
				+#define CUDA_R_16F  MUSA_R_16F
			
 
				+#define CUDA_R_32F  MUSA_R_32F
			
 
				+#define cublasComputeType_t cudaDataType_t // resolves through cudaDataType_t, which this header maps to musaDataType_t
			
 
				+#define cublasCreate mublasCreate
			
 
				+#define cublasDestroy mublasDestroy
			
 
				+#define cublasGemmEx mublasGemmEx
			
 
				+#define cublasGemmBatchedEx mublasGemmBatchedEx
			
 
				+#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
			
 
				+#define cublasHandle_t mublasHandle_t
			
 
				+#define cublasSetMathMode mublasSetMathMode
			
 
				+#define cublasSetStream mublasSetStream
			
 
				+#define cublasSgemm mublasSgemm
			
 
				+#define cublasStatus_t mublasStatus_t
			
 
				+#define cublasGetStatusString mublasStatus_to_string
			
 
				+#define cudaDataType_t musaDataType_t // start of the CUDA runtime -> MUSA runtime name table
			
 
				+#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
			
 
				+#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
			
 
				+#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
			
 
				+#define cudaDeviceProp musaDeviceProp
			
 
				+#define cudaDeviceSynchronize musaDeviceSynchronize
			
 
				+#define cudaError_t musaError_t
			
 
				+#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
			
 
				+#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
			
 
				+#define cudaEventCreateWithFlags musaEventCreateWithFlags
			
 
				+#define cudaEventDisableTiming musaEventDisableTiming
			
 
				+#define cudaEventRecord musaEventRecord
			
 
				+#define cudaEventSynchronize musaEventSynchronize
			
 
				+#define cudaEvent_t musaEvent_t
			
 
				+#define cudaEventDestroy musaEventDestroy
			
 
				+#define cudaFree musaFree
			
 
				+#define cudaFreeHost musaFreeHost
			
 
				+#define cudaGetDevice musaGetDevice
			
 
				+#define cudaGetDeviceCount musaGetDeviceCount
			
 
				+#define cudaGetDeviceProperties musaGetDeviceProperties
			
 
				+#define cudaGetErrorString musaGetErrorString
			
 
				+#define cudaGetLastError musaGetLastError
			
 
				+#define cudaHostRegister musaHostRegister
			
 
				+#define cudaHostRegisterPortable musaHostRegisterPortable
			
 
				+#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
			
 
				+#define cudaHostUnregister musaHostUnregister
			
 
				+#define cudaLaunchHostFunc musaLaunchHostFunc
			
 
				+#define cudaMalloc musaMalloc
			
 
				+#define cudaMallocHost musaMallocHost
			
 
				+#define cudaMemcpy musaMemcpy
			
 
				+#define cudaMemcpyAsync musaMemcpyAsync
			
 
				+#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
			
 
				+#define cudaMemcpy2DAsync musaMemcpy2DAsync
			
 
				+#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
			
 
				+#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
			
 
				+#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
			
 
				+#define cudaMemcpyKind musaMemcpyKind
			
 
				+#define cudaMemset musaMemset
			
 
				+#define cudaMemsetAsync musaMemsetAsync
			
 
				+#define cudaMemGetInfo musaMemGetInfo
			
 
				+#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
			
 
				+#define cudaSetDevice musaSetDevice
			
 
				+#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
			
 
				+#define cudaStreamDestroy musaStreamDestroy
			
 
				+#define cudaStreamFireAndForget musaStreamFireAndForget
			
 
				+#define cudaStreamNonBlocking musaStreamNonBlocking
			
 
				+#define cudaStreamPerThread musaStreamPerThread
			
 
				+#define cudaStreamSynchronize musaStreamSynchronize
			
 
				+#define cudaStreamWaitEvent musaStreamWaitEvent
			
 
				+#define cudaStream_t musaStream_t
			
 
				+#define cudaSuccess musaSuccess
			
 
				+
			
 
				+// Additional mappings for MUSA virtual memory pool
			
 
				+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED // note: the MUSA-side name says ADDRESS where the CUDA-side name says MEMORY
			
 
				+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
			
 
				+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
			
 
				+#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
			
 
				+#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
			
 
				+#define CUdevice MUdevice
			
 
				+#define CUdeviceptr MUdeviceptr
			
 
				+#define CUmemAccessDesc MUmemAccessDesc
			
 
				+#define CUmemAllocationProp MUmemAllocationProp
			
 
				+#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
			
 
				+#define cuDeviceGet muDeviceGet
			
 
				+#define cuDeviceGetAttribute muDeviceGetAttribute
			
 
				+#define cuMemAddressFree muMemAddressFree
			
 
				+#define cuMemAddressReserve muMemAddressReserve
			
 
				+#define cuMemCreate muMemCreate
			
 
				+#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
			
 
				+#define cuMemMap muMemMap
			
 
				+#define cuMemRelease muMemRelease
			
 
				+#define cuMemSetAccess muMemSetAccess
			
 
				+#define cuMemUnmap muMemUnmap
			
 
				+#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize // from here to the end of this group: runtime-API (cuda* / make_cuda*) names rather than cu* / CU* names
			
 
				+#define cudaFuncSetAttribute musaFuncSetAttribute
			
 
				+#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
			
 
				+#define make_cudaExtent make_musaExtent
			
 
				+#define make_cudaPitchedPtr make_musaPitchedPtr
			
 
				+
			
 
				+// Additional mappings for MUSA graphs
			
 
				+#define CUDA_SUCCESS MUSA_SUCCESS
			
 
				+#define CUresult MUresult
			
 
				+#define cuGetErrorString muGetErrorString
			
 
				+#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
			
 
				+#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
			
 
				+#define cudaGraphDestroy musaGraphDestroy
			
 
				+#define cudaGraphExecDestroy musaGraphExecDestroy
			
 
				+#define cudaGraphExec_t musaGraphExec_t
			
 
				+#define cudaGraphExecUpdate musaGraphExecUpdate
			
 
				+#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult // note: the target name has no "Info" suffix, unlike the CUDA-side name
			
 
				+#define cudaGraphGetNodes musaGraphGetNodes
			
 
				+#define cudaGraphInstantiate musaGraphInstantiate
			
 
				+#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
			
 
				+#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
			
 
				+#define cudaGraphLaunch musaGraphLaunch
			
 
				+#define cudaGraphNodeGetType musaGraphNodeGetType
			
 
				+#define cudaGraphNode_t musaGraphNode_t
			
 
				+#define cudaGraphNodeType musaGraphNodeType
			
 
				+#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
			
 
				+#define cudaGraph_t musaGraph_t
			
 
				+#define cudaKernelNodeParams musaKernelNodeParams
			
 
				+#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
			
 
				+#define cudaStreamEndCapture musaStreamEndCapture
			
 
				+
			
 
				+// XXX: Clang builtins mapping
			
 
				+#define __vsub4   __vsub4_musa // the three SIMD helper names are redirected to the *_musa functions defined later in this header
			
 
				+#define __vcmpeq4 __vcmpeq4_musa
			
 
				+#define __vcmpne4 __vcmpne4_musa
			
 
				+
			
 
				+#ifndef __has_builtin // compilers without __has_builtin: make every builtin query evaluate to 0
			
 
				+    #define __has_builtin(x) 0
			
 
				+#endif
			
 
				+
			
 
				+typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); // vector of four uint8_t lanes, used to view one 32-bit value as four unsigned bytes
			
 
				+
			
 
				+// Per-byte subtraction with wrap-around: views a and b as four 8-bit lanes and
				+// returns (a_lane - b_lane) modulo 256 for every lane, repacked into one int.
				+// This is the behaviour of the CUDA __vsub4 intrinsic that the __vsub4 macro
				+// redirects here. The previous body forwarded to __vsubss4, which saturates to
				+// the signed 8-bit range and therefore returned different bytes whenever a lane
				+// difference fell outside [-128, 127].
				+static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
				+    const uint8x4_t va = reinterpret_cast<const uint8x4_t&>(a);
				+    const uint8x4_t vb = reinterpret_cast<const uint8x4_t&>(b);
				+    // Unsigned lanes: the element-wise subtract wraps instead of saturating.
				+    const uint8x4_t vc = va - vb;
				+    return reinterpret_cast<const int &>(vc);
				+}
			
 
				+
			
 
				+// Per-byte equality compare: for each of the four bytes of a and b the matching
				+// result byte is 0xff when the two bytes are equal and 0x00 otherwise.
				+static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
				+    const uint8x4_t lhs = reinterpret_cast<const uint8x4_t&>(a);
				+    const uint8x4_t rhs = reinterpret_cast<const uint8x4_t&>(b);
				+    uint8x4_t mask;
				+#pragma unroll
				+    for (int lane = 0; lane < 4; ++lane) {
				+        if (lhs[lane] != rhs[lane]) {
				+            mask[lane] = 0x00;
				+        } else {
				+            mask[lane] = 0xff;
				+        }
				+    }
				+    // All four lanes were written above; hand the bytes back as one 32-bit value.
				+    return reinterpret_cast<const unsigned int&>(mask);
				+}
			
 
				+
			
 
				+// Per-byte inequality compare: for each of the four bytes of a and b the matching
				+// result byte is 0xff when the two bytes differ and 0x00 when they are equal.
				+static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
				+    const uint8x4_t lhs = reinterpret_cast<const uint8x4_t&>(a);
				+    const uint8x4_t rhs = reinterpret_cast<const uint8x4_t&>(b);
				+    uint8x4_t mask;
				+#pragma unroll
				+    for (int lane = 0; lane < 4; ++lane) {
				+        if (lhs[lane] != rhs[lane]) {
				+            mask[lane] = 0xff;
				+        } else {
				+            mask[lane] = 0x00;
				+        }
				+    }
				+    // All four lanes were written above; hand the bytes back as one 32-bit value.
				+    return reinterpret_cast<const unsigned int&>(mask);
				+}
			
--- a/llama/sync.sh
+++ b/llama/sync.sh
@@ -68,9 +68,11 @@ cp $src_dir/ggml/src/ggml-aarch64.c $dst_dir/ggml-aarch64.c
 
				 
			
 
				 # ggml-cuda
			
 
				 mkdir -p $dst_dir/ggml-cuda/template-instances
			
 
				+mkdir -p $dst_dir/ggml-cuda/vendors # destination directory for the vendor headers copied at the end of this section
			
 
				 cp $src_dir/ggml/src/ggml-cuda/*.cu $dst_dir/ggml-cuda/
			
 
				 cp $src_dir/ggml/src/ggml-cuda/*.cuh $dst_dir/ggml-cuda/
			
 
				 cp $src_dir/ggml/src/ggml-cuda/template-instances/*.cu $dst_dir/ggml-cuda/template-instances/
			
 
				+cp $src_dir/ggml/src/ggml-cuda/vendors/*.h $dst_dir/ggml-cuda/vendors/ # copies every .h file from the source vendors directory
			
 
				 
			
 
				 # llava
			
 
				 cp $src_dir/examples/llava/clip.cpp $dst_dir/clip.cpp