ml/backend/ggml: follow on fixes after updating vendored code (#9388)

Fixes sync filters and lowers CUDA version to 11.3 in test.yaml
Jeffrey Morgan, 2 months ago
commit a5272130c4

+ 3 - 3
.github/workflows/test.yaml

@@ -78,8 +78,8 @@ jobs:
         include:
           - preset: CPU
           - preset: CUDA
-            install: https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_522.06_windows.exe
-            flags: '-DCMAKE_CUDA_ARCHITECTURES=87'
+            install: https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.89_win10.exe
+            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
           - preset: ROCm
             install: https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe
             flags: '-DAMDGPU_TARGETS=gfx1010'
@@ -102,7 +102,7 @@ jobs:
           $ErrorActionPreference = "Stop"
           if ("${{ steps.cache-install.outputs.cache-hit }}" -ne 'true') {
             Invoke-WebRequest -Uri "${{ matrix.install }}" -OutFile "install.exe"
-            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.8", "nvcc_11.8", "cublas_11.8", "cublas_dev_11.8")) -NoNewWindow -Wait
+            Start-Process -FilePath .\install.exe -ArgumentList (@("-s", "cudart_11.3", "nvcc_11.3", "cublas_11.3", "cublas_dev_11.3")) -NoNewWindow -Wait
           }
 
           $cudaPath = (Resolve-Path "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\*").path
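
For reference, -DCMAKE_CUDA_ARCHITECTURES=80 makes nvcc emit device code for compute capability 8.0 (Ampere) instead of 8.7, which the older CUDA 11.3 toolchain installed above cannot target. The snippet below is a minimal, hypothetical sketch, not taken from the ggml sources, of how device code typically branches on the compiled architecture via __CUDA_ARCH__:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Hypothetical probe kernel: __CUDA_ARCH__ is only defined during device
    // compilation and equals 800 when building with CMAKE_CUDA_ARCHITECTURES=80.
    __global__ void arch_probe() {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
        if (threadIdx.x == 0) printf("device code built for sm_%d\n", __CUDA_ARCH__ / 10);
    #else
        if (threadIdx.x == 0) printf("device code built for a pre-Ampere target\n");
    #endif
    }

    int main() {
        arch_probe<<<1, 32>>>();
        cudaDeviceSynchronize();
        return 0;
    }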

+ 1 - 0
ml/backend/ggml/ggml/.rsync-filter

@@ -9,6 +9,7 @@ include src/ggml-cpu/
 include src/ggml-cpu/amx/
 include src/ggml-cpu/llamafile/
 include src/ggml-cuda/
+include src/ggml-cuda/vendors/
 include src/ggml-cuda/template-instances/
 include src/ggml-hip/
 include src/ggml-metal/

+ 1 - 0
ml/backend/ggml/ggml/src/ggml-cuda/vendors/cuda.h

@@ -3,6 +3,7 @@
 #include <cuda_runtime.h>
 #include <cuda.h>
 #include <cublas_v2.h>
+#include <cuda_bf16.h>
 #include <cuda_fp16.h>
 
 #if CUDART_VERSION < 11020
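
The added cuda_bf16.h include exposes the __nv_bfloat16 type and its float conversion intrinsics to every translation unit that pulls in this vendor header. A minimal sketch of the kind of code that relies on it (an illustrative example, not taken from ggml):

    #include <cuda_bf16.h>
    #include <cuda_runtime.h>

    // Hypothetical kernel: converts float to bfloat16 using the type and
    // intrinsic declared in cuda_bf16.h; without that include this would
    // fail to compile.
    __global__ void f32_to_bf16(const float *src, __nv_bfloat16 *dst, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            dst[i] = __float2bfloat16(src[i]);
        }
    }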

+ 46 - 0
ml/backend/ggml/ggml/src/ggml-cuda/vendors/hip.h

@@ -1,5 +1,6 @@
 #pragma once
 
+#define HIP_ENABLE_WARP_SYNC_BUILTINS 1
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
 #include <hip/hip_fp16.h>
@@ -8,6 +9,7 @@
 // for rocblas_initialize()
 #include "rocblas/rocblas.h"
 #endif // __HIP_PLATFORM_AMD__
+
 #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
 #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
 #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
@@ -19,6 +21,13 @@
 #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_32F  HIPBLAS_R_32F
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
+#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
+#define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
@@ -74,6 +83,21 @@
 #define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
+#define cuDeviceGet hipDeviceGet
+#define CUdevice hipDevice_t
+#define CUdeviceptr hipDeviceptr_t
+#define cuMemUnmap hipMemUnmap
+#define CUmemAccessDesc hipMemAccessDesc
+#define cuMemAddressFree hipMemAddressFree
+#define cuMemRelease hipMemRelease
+#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
+#define cuMemCreate hipMemCreate
+#define cuMemAddressReserve hipMemAddressReserve
+#define cuMemMap hipMemMap
+#define cuMemSetAccess hipMemSetAccess
+#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+#define CUmemAllocationProp hipMemAllocationProp
+#define cuDeviceGetAttribute hipDeviceGetAttribute
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
 #define cudaStreamFireAndForget hipStreamFireAndForget
@@ -81,6 +105,28 @@
 #define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaGraphExec_t hipGraphExec_t
+#define cudaGraphNode_t hipGraphNode_t
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaGraphExecDestroy hipGraphExecDestroy
+#define cudaGraphLaunch hipGraphLaunch
+#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
+#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
+#define cudaGraphNodeType hipGraphNodeType
+#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
+#define cudaGraphInstantiate hipGraphInstantiate
+#define cudaStreamEndCapture hipStreamEndCapture
+#define cudaGraphDestroy hipGraphDestroy
+#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
+#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
+#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
+#define cudaGraphNodeGetType hipGraphNodeGetType
+#define cudaGraphGetNodes hipGraphGetNodes
+#define cudaGraphExecUpdate hipGraphExecUpdate
+#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
+#define cudaStreamBeginCapture hipStreamBeginCapture
+#define cudaGraph_t hipGraph_t
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
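
Taken together, these #defines let the backend's CUDA-spelled sources compile unchanged against ROCm: the preprocessor rewrites each CUDA identifier to its HIP counterpart, including the newly added warp-shuffle, virtual-memory-management, and graph APIs. Below is a minimal, hypothetical sketch (not ggml code) of what the mappings enable; the include path is assumed:

    // Assumed include path; the defines above rename every CUDA symbol used here.
    #include "ggml-cuda/vendors/hip.h"

    // Device side: __shfl_xor_sync(mask, v, off, width) expands to
    // __shfl_xor(v, off, width) via the macro above.
    __device__ float warp_reduce_sum(float v) {
        for (int off = 16; off > 0; off >>= 1) {
            v += __shfl_xor_sync(0xffffffffu, v, off, 32);
        }
        return v;
    }

    // Host side: cudaStream_t / cudaStreamCreateWithFlags / cudaSuccess map to
    // hipStream_t / hipStreamCreateWithFlags / hipSuccess.
    static int stream_roundtrip(void) {
        cudaStream_t stream;
        if (cudaStreamCreateWithFlags(&stream, 0) != cudaSuccess) {
            return -1;
        }
        cudaStreamSynchronize(stream);
        cudaStreamDestroy(stream);
        return 0;
    }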