1 年之前 · d1b2f532b9
--- a/llm/ggml-alloc.c
+++ b/llm/ggml-alloc.c
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/ggml-alloc.h
+++ b/llm/ggml-alloc.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/ggml-cuda.cu
+++ b/llm/ggml-cuda.cu
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -1779,7 +1779,6 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
 
															 }
														
 
															 // contiguous u/y values
														
 
															-// also used for q5_K
														
 
															 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
														
 
															     const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
														
 
															     const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
														
@@ -1789,19 +1788,18 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 
															     float sumf_m = 0.0f;
														
 
															 #pragma unroll
														
 
															-    for (int i0 = 0; i0 < VDR_Q4_K_Q8_1_MMQ; i0 += (QI8_1/QR4_K)) {
														
 
															+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
														
 
															         int sumi_d = 0;
														
 
															 #pragma unroll
														
 
															-        for (int i = i0; i < i0 + (QI8_1/QR4_K); ++i) {
														
 
															-            sumi_d = __dp4a(v[2*i+0], u[2*i+0], sumi_d); // SIMD dot product
														
 
															-            sumi_d = __dp4a(v[2*i+1], u[2*i+1], sumi_d); // SIMD dot product
														
 
															+        for (int j = 0; j < QI8_1; ++j) {
														
 
															+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
														
 
															         }
														
 
															-        const float2 ds8f = __half22float2(ds8[i0 / 4]);
														
 
															+        const float2 ds8f = __half22float2(ds8[i]);
														
 
															-        sumf_d += ds8f.x * (sc[i0/4] * sumi_d);
														
 
															-        sumf_m += ds8f.y *   m[i0/4]; // sum of q8_1 block * q4_K min val
														
 
															+        sumf_d += ds8f.x * (sc[i] * sumi_d);
														
 
															+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
														
 
															     }
														
 
															     const float2 dm4f = __half22float2(dm4);
														
@@ -1818,7 +1816,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
 
															 #define VDR_Q5_K_Q8_1_MMQ  8
														
 
															 // contiguous v/x values
														
 
															-static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
														
 
															+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
														
 
															     const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc,
														
 
															     const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) {
														
@@ -1855,6 +1853,40 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl(
 
															 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
														
 
															 }
														
 
															+// contiguous u/y values
														
 
															+static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
														
 
															+    const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc,
														
 
															+    const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) {
														
 
															+
														
 
															+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
														
 
															+    float sumf_d = 0.0f;
														
 
															+    float sumf_m = 0.0f;
														
 
															+
														
 
															+#pragma unroll
														
 
															+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
														
 
															+        int sumi_d = 0;
														
 
															+
														
 
															+#pragma unroll
														
 
															+        for (int j = 0; j < QI8_1; ++j) {
														
 
															+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
														
 
															+        }
														
 
															+
														
 
															+        const float2 ds8f = __half22float2(ds8[i]);
														
 
															+
														
 
															+        sumf_d += ds8f.x * (sc[i] * sumi_d);
														
 
															+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
														
 
															+    }
														
 
															+
														
 
															+    const float2 dm4f = __half22float2(dm4);
														
 
															+
														
 
															+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
														
 
															+
														
 
															+#else
														
 
															+    assert(false);
														
 
															+    return 0.0f; // only to satisfy the compiler
														
 
															+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
														
 
															+}
														
 
															+
														
 
															 #define VDR_Q6_K_Q8_1_MMVQ 1
														
 
															 #define VDR_Q6_K_Q8_1_MMQ  8
														
@@ -2850,18 +2882,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
 
															     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
														
 
															     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
														
 
															-    int v[QR4_K*VDR_Q4_K_Q8_1_MMQ];
														
 
															-
														
 
															-#pragma unroll
														
 
															-    for (int l = 0; l < VDR_Q4_K_Q8_1_MMQ; ++l) {
														
 
															-        v[l + 0]         = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 0) & 0x0F0F0F0F;
														
 
															-        v[l + (QI4_K/4)] = (x_ql[i * (WARP_SIZE + 1) + k + l] >> 4) & 0x0F0F0F0F;
														
 
															-    }
														
 
															-
														
 
															     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
														
 
															     const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
														
 
															-    return vec_dot_q4_K_q8_1_impl_mmq(v, &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
														
 
															+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
														
 
															+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
														
 
															 }
														
 
															 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
														
@@ -2908,7 +2933,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 
															         u[2*i+1] = q8[4];
														
 
															     }
														
 
															-    return vec_dot_q5_K_q8_1_impl(vl, vh, u, sc, m, bq5_K->dm, d8);
														
 
															+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
														
 
															 #else
														
@@ -3051,7 +3076,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
 
															     const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
														
 
															     const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
														
 
															-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
														
 
															+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
														
 
															+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
														
 
															 }
														
 
															 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
														
@@ -3327,7 +3353,11 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
 
															 #define  MMQ_Y_Q4_1_PASCAL 64
														
 
															 #define NWARPS_Q4_1_PASCAL 8
														
 
															-template <bool need_check> static __global__ void mul_mat_q4_1(
														
 
															+template <bool need_check> static __global__ void
														
 
															+#if __CUDA_ARCH__ < CC_TURING
														
 
															+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
														
 
															+#endif // __CUDA_ARCH__ < CC_TURING
														
 
															+    mul_mat_q4_1(
														
 
															     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
														
 
															     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
														
@@ -3497,7 +3527,11 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
 
															 #define  MMQ_Y_Q3_K_PASCAL 64
														
 
															 #define NWARPS_Q3_K_PASCAL 8
														
 
															-template <bool need_check> static __global__ void mul_mat_q3_K(
														
 
															+template <bool need_check> static __global__ void
														
 
															+#if __CUDA_ARCH__ < CC_TURING
														
 
															+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
														
 
															+#endif // __CUDA_ARCH__ < CC_TURING
														
 
															+    mul_mat_q3_K(
														
 
															     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
														
 
															     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
														
@@ -3527,11 +3561,15 @@ template <bool need_check> static __global__ void mul_mat_q3_K(
 
															 #define  MMQ_X_Q4_K_AMPERE 64
														
 
															 #define  MMQ_Y_Q4_K_AMPERE 128
														
 
															 #define NWARPS_Q4_K_AMPERE 4
														
 
															-#define  MMQ_X_Q4_K_PASCAL 32
														
 
															+#define  MMQ_X_Q4_K_PASCAL 64
														
 
															 #define  MMQ_Y_Q4_K_PASCAL 64
														
 
															 #define NWARPS_Q4_K_PASCAL 8
														
 
															-template <bool need_check> static __global__ void mul_mat_q4_K(
														
 
															+template <bool need_check> static __global__ void
														
 
															+#if __CUDA_ARCH__ < CC_TURING
														
 
															+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
														
 
															+#endif // __CUDA_ARCH__ < CC_TURING
														
 
															+    mul_mat_q4_K(
														
 
															     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
														
 
															     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
														
@@ -3595,11 +3633,15 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
 
															 #define  MMQ_X_Q6_K_AMPERE 64
														
 
															 #define  MMQ_Y_Q6_K_AMPERE 64
														
 
															 #define NWARPS_Q6_K_AMPERE 4
														
 
															-#define  MMQ_X_Q6_K_PASCAL 32
														
 
															+#define  MMQ_X_Q6_K_PASCAL 64
														
 
															 #define  MMQ_Y_Q6_K_PASCAL 64
														
 
															 #define NWARPS_Q6_K_PASCAL 8
														
 
															-template <bool need_check> static __global__ void mul_mat_q6_K(
														
 
															+template <bool need_check> static __global__ void
														
 
															+#if __CUDA_ARCH__ < CC_TURING
														
 
															+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
														
 
															+#endif // __CUDA_ARCH__ < CC_TURING
														
 
															+    mul_mat_q6_K(
														
 
															     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
														
 
															     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
														
--- a/llm/ggml-cuda.h
+++ b/llm/ggml-cuda.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/ggml-metal.h
+++ b/llm/ggml-metal.h
@@ -1,7 +1,7 @@
 
															 //go:build darwin
														
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/ggml-metal.m
+++ b/llm/ggml-metal.m
@@ -1,7 +1,7 @@
 
															 //go:build darwin
														
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -154,7 +154,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 
															         ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
														
 
															         if (error) {
														
 
															             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
														
 
															-            exit(1);
														
 
															+            return NULL;
														
 
															         }
														
 
															     }
														
 
															 #else
														
@@ -172,7 +172,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 
															         NSString * src  = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
														
 
															         if (error) {
														
 
															             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
														
 
															-            exit(1);
														
 
															+            return NULL;
														
 
															         }
														
 
															 #ifdef GGML_QKK_64
														
@@ -184,7 +184,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 
															 #endif
														
 
															         if (error) {
														
 
															             fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
														
 
															-            exit(1);
														
 
															+            return NULL;
														
 
															         }
														
 
															     }
														
 
															 #endif
														
--- a/llm/ggml-metal.metal
+++ b/llm/ggml-metal.metal
@@ -1,7 +1,7 @@
 
															 //go:build darwin
														
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/ggml-mpi.c
+++ b/llm/ggml-mpi.c
@@ -1,7 +1,7 @@
 
															 //go:build mpi
														
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/ggml-mpi.h
+++ b/llm/ggml-mpi.h
@@ -1,7 +1,7 @@
 
															 //go:build mpi
														
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/ggml-opencl.cpp
+++ b/llm/ggml-opencl.cpp
@@ -1,7 +1,7 @@
 
															 //go:build opencl
														
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/ggml-opencl.h
+++ b/llm/ggml-opencl.h
@@ -1,7 +1,7 @@
 
															 //go:build opencl
														
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/ggml.c
+++ b/llm/ggml.c
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/ggml.h
+++ b/llm/ggml.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/k_quants.c
+++ b/llm/k_quants.c
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/k_quants.h
+++ b/llm/k_quants.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llm/llama-util.h
+++ b/llm/llama-util.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -297,20 +297,29 @@ struct llama_mmap {
 
															             throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
														
 
															         }
														
 
															-        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
														
 
															         if (prefetch) {
														
 
															-            // Advise the kernel to preload the mapped memory
														
 
															-            WIN32_MEMORY_RANGE_ENTRY range;
														
 
															-            range.VirtualAddress = addr;
														
 
															-            range.NumberOfBytes = (SIZE_T)size;
														
 
															-            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
														
 
															-                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
														
 
															-                        llama_format_win_err(GetLastError()).c_str());
														
 
															+            // The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
														
 
															+            // will dynamically load it using GetProcAddress.
														
 
															+            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
														
 
															+            HMODULE hKernel32;
														
 
															+
														
 
															+            // This call is guaranteed to succeed.
														
 
															+            hKernel32 = GetModuleHandleW(L"kernel32.dll");
														
 
															+
														
 
															+            // This call may fail if on a pre-Win8 system.
														
 
															+            pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
														
 
															+
														
 
															+            if (pPrefetchVirtualMemory) {
														
 
															+                // Advise the kernel to preload the mapped memory.
														
 
															+                WIN32_MEMORY_RANGE_ENTRY range;
														
 
															+                range.VirtualAddress = addr;
														
 
															+                range.NumberOfBytes = (SIZE_T)size;
														
 
															+                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
														
 
															+                    fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
														
 
															+                            llama_format_win_err(GetLastError()).c_str());
														
 
															+                }
														
 
															             }
														
 
															         }
														
 
															-        #else
														
 
															-        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
														
 
															-        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
														
 
															     }
														
 
															     ~llama_mmap() {
														
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -3363,6 +3363,12 @@ struct llama_context * llama_new_context_with_model(
 
															         // this allocates all Metal resources and memory buffers
														
 
															         ctx->ctx_metal = ggml_metal_init(1);
														
 
															+        if (!ctx->ctx_metal) {
														
 
															+            LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
														
 
															+            llama_free(ctx);
														
 
															+            return NULL;
														
 
															+        }
														
 
															+
														
 
															         void * data_ptr  = NULL;
														
 
															         size_t data_size = 0;
														
--- a/llm/llama.h
+++ b/llm/llama.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
														
 
															+ * llama.cpp - git 3ebb00935f3f0522b75df49c2769ab1774b91380
														
 
															  *
														
 
															  * MIT License
														
 
															  *