1 gadu atpakaļ · dde880290c
--- a/llama/ggml-cuda.cu
+++ b/llama/ggml-cuda.cu
--- a/llama/ggml-cuda.h
+++ b/llama/ggml-cuda.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
														
 
															+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/ggml-metal.h
+++ b/llama/ggml-metal.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
														
 
															+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/ggml-metal.m
+++ b/llama/ggml-metal.m
@@ -1,7 +1,7 @@
 
															 // +build darwin
														
 
															 /**
														
 
															- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
														
 
															+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -722,8 +722,8 @@ void ggml_metal_graph_compute(
 
															                                             GGML_ASSERT(ne02 == 1);
														
 
															                                             GGML_ASSERT(ne12 == 1);
														
 
															-                                            nth0 = 4;
														
 
															-                                            nth1 = 16;
														
 
															+                                            nth0 = 2;
														
 
															+                                            nth1 = 32;
														
 
															                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
														
 
															                                         } break;
														
 
															                                     case GGML_TYPE_Q5_K:
														
@@ -731,8 +731,8 @@ void ggml_metal_graph_compute(
 
															                                             GGML_ASSERT(ne02 == 1);
														
 
															                                             GGML_ASSERT(ne12 == 1);
														
 
															-                                            nth0 = 4;
														
 
															-                                            nth1 = 16;
														
 
															+                                            nth0 = 2;
														
 
															+                                            nth1 = 32;
														
 
															                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
														
 
															                                         } break;
														
 
															                                     case GGML_TYPE_Q6_K:
														
@@ -740,8 +740,8 @@ void ggml_metal_graph_compute(
 
															                                             GGML_ASSERT(ne02 == 1);
														
 
															                                             GGML_ASSERT(ne12 == 1);
														
 
															-                                            nth0 = 4;
														
 
															-                                            nth1 = 16;
														
 
															+                                            nth0 = 2;
														
 
															+                                            nth1 = 32;
														
 
															                                             [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
														
 
															                                         } break;
														
 
															                                     default:
														
@@ -767,15 +767,18 @@ void ggml_metal_graph_compute(
 
															                                 [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
														
 
															                                 [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
														
 
															-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
														
 
															-                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
														
 
															-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
														
 
															+                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
														
 
															+                                    src0t == GGML_TYPE_Q4_K) {
														
 
															+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
														
 
															+                                }
														
 
															+                                else if (src0t == GGML_TYPE_Q5_K) {
														
 
															+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
														
 
															+                                }
														
 
															+                                else if (src0t == GGML_TYPE_Q6_K) {
														
 
															+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
														
 
															                                 }
														
 
															                                 else if (src0t == GGML_TYPE_Q2_K ||
														
 
															-                                         src0t == GGML_TYPE_Q3_K ||
														
 
															-                                         src0t == GGML_TYPE_Q4_K ||
														
 
															-                                         src0t == GGML_TYPE_Q5_K ||
														
 
															-                                         src0t == GGML_TYPE_Q6_K) {
														
 
															+                                         src0t == GGML_TYPE_Q3_K) {
														
 
															                                     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
														
 
															                                     [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
														
 
															                                 } else {
														
@@ -821,7 +824,7 @@ void ggml_metal_graph_compute(
 
															                             const float eps = 1e-6f;
														
 
															-                            const int nth = 256;
														
 
															+                            const int nth = 512;
														
 
															                             [encoder setComputePipelineState:ctx->pipeline_rms_norm];
														
 
															                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
														
@@ -829,7 +832,7 @@ void ggml_metal_graph_compute(
 
															                             [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
														
 
															                             [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
														
 
															                             [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
														
 
															-                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
														
 
															+                            [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
														
 
															                             const int64_t nrows = ggml_nrows(src0);
														
@@ -910,28 +913,35 @@ void ggml_metal_graph_compute(
 
															                             const int n_past = ((int32_t *)(src1->data))[0];
														
 
															+                            float freq_base;
														
 
															+                            float freq_scale;
														
 
															+                            memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
														
 
															+                            memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
														
 
															+
														
 
															                             [encoder setComputePipelineState:ctx->pipeline_rope];
														
 
															                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
														
 
															                             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
														
 
															-                            [encoder setBytes:&ne00   length:sizeof( int64_t) atIndex:2];
														
 
															-                            [encoder setBytes:&ne01   length:sizeof( int64_t) atIndex:3];
														
 
															-                            [encoder setBytes:&ne02   length:sizeof( int64_t) atIndex:4];
														
 
															-                            [encoder setBytes:&ne03   length:sizeof( int64_t) atIndex:5];
														
 
															-                            [encoder setBytes:&nb00   length:sizeof(uint64_t) atIndex:6];
														
 
															-                            [encoder setBytes:&nb01   length:sizeof(uint64_t) atIndex:7];
														
 
															-                            [encoder setBytes:&nb02   length:sizeof(uint64_t) atIndex:8];
														
 
															-                            [encoder setBytes:&nb03   length:sizeof(uint64_t) atIndex:9];
														
 
															-                            [encoder setBytes:&ne0    length:sizeof( int64_t) atIndex:10];
														
 
															-                            [encoder setBytes:&ne1    length:sizeof( int64_t) atIndex:11];
														
 
															-                            [encoder setBytes:&ne2    length:sizeof( int64_t) atIndex:12];
														
 
															-                            [encoder setBytes:&ne3    length:sizeof( int64_t) atIndex:13];
														
 
															-                            [encoder setBytes:&nb0    length:sizeof(uint64_t) atIndex:14];
														
 
															-                            [encoder setBytes:&nb1    length:sizeof(uint64_t) atIndex:15];
														
 
															-                            [encoder setBytes:&nb2    length:sizeof(uint64_t) atIndex:16];
														
 
															-                            [encoder setBytes:&nb3    length:sizeof(uint64_t) atIndex:17];
														
 
															-                            [encoder setBytes:&n_past length:sizeof(     int) atIndex:18];
														
 
															-                            [encoder setBytes:&n_dims length:sizeof(     int) atIndex:19];
														
 
															-                            [encoder setBytes:&mode   length:sizeof(     int) atIndex:20];
														
 
															+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
														
 
															+                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
														
 
															+                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
														
 
															+                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
														
 
															+                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
														
 
															+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
														
 
															+                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
														
 
															+                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
														
 
															+                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
														
 
															+                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
														
 
															+                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
														
 
															+                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
														
 
															+                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
														
 
															+                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
														
 
															+                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
														
 
															+                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
														
 
															+                            [encoder setBytes:&n_past  length:sizeof(     int) atIndex:18];
														
 
															+                            [encoder setBytes:&n_dims  length:sizeof(     int) atIndex:19];
														
 
															+                            [encoder setBytes:&mode    length:sizeof(     int) atIndex:20];
														
 
															+                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:21];
														
 
															+                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
														
 
															                             [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
														
 
															                         } break;
														
--- a/llama/ggml-metal.metal
+++ b/llama/ggml-metal.metal
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
														
 
															+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -357,26 +357,33 @@ kernel void kernel_rms_norm(
 
															         threadgroup float  * sum [[threadgroup(0)]],
														
 
															         uint tgpig[[threadgroup_position_in_grid]],
														
 
															         uint tpitg[[thread_position_in_threadgroup]],
														
 
															+        uint sgitg[[simdgroup_index_in_threadgroup]],
														
 
															+        uint tiisg[[thread_index_in_simdgroup]],
														
 
															         uint   ntg[[threads_per_threadgroup]]) {
														
 
															-    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
														
 
															+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
														
 
															+    device const float * x_scalar = (device const float *) x;
														
 
															+    float4 sumf=0;
														
 
															+    float all_sum=0;
														
 
															     // parallel sum
														
 
															-    sum[tpitg] = 0.0f;
														
 
															-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
														
 
															-        sum[tpitg] += x[i00] * x[i00];
														
 
															+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
														
 
															+        sumf += x[i00] * x[i00];
														
 
															+    }
														
 
															+    all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
														
 
															+    all_sum = simd_sum(all_sum);
														
 
															+    if (tiisg == 0) {
														
 
															+        sum[sgitg] = all_sum;
														
 
															     }
														
 
															-    // reduce
														
 
															     threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    for (uint i = ntg/2; i > 0; i /= 2) {
														
 
															-        if (tpitg < i) {
														
 
															-            sum[tpitg] += sum[tpitg + i];
														
 
															-        }
														
 
															-        threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															+    // reduce the per-simdgroup partial sums; the number of simd groups is ntg / 32
														
 
															+    for (int i = ntg / 32 / 2; i > 0; i /= 2) {
														
 
															+       if (tpitg < i) {
														
 
															+           sum[tpitg] += sum[tpitg + i];
														
 
															+       }
														
 
															     }
														
 
															-
														
 
															-    // broadcast
														
 
															     if (tpitg == 0) {
														
 
															+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
														
 
															         sum[0] /= ne00;
														
 
															     }
														
@@ -385,147 +392,127 @@ kernel void kernel_rms_norm(
 
															     const float mean  = sum[0];
														
 
															     const float scale = 1.0f/sqrt(mean + eps);
														
 
															-    device float * y = dst + tgpig*ne00;
														
 
															-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
														
 
															+    device float4 * y = (device float4 *) (dst + tgpig*ne00);
														
 
															+    device float * y_scalar = (device float *) y;
														
 
															+    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
														
 
															         y[i00] = x[i00] * scale;
														
 
															     }
														
 
															+    if (tpitg == 0) {
														
 
															+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
														
 
															+    }
														
 
															 }
														
 
															-kernel void kernel_mul_mat_q4_0_f32(
														
 
															-        device const  void * src0,
														
 
															-        device const float * src1,
														
 
															-        device       float * dst,
														
 
															-        constant   int64_t & ne00,
														
 
															-        constant   int64_t & ne10,
														
 
															-        constant   int64_t & ne0,
														
 
															-        threadgroup float  * sum [[threadgroup(0)]],
														
 
															-        uint2 tgpig[[threadgroup_position_in_grid]],
														
 
															-        uint2 tpitg[[thread_position_in_threadgroup]],
														
 
															-        uint2  tptg[[threads_per_threadgroup]]) {
														
 
															-    const int nb = ne00/QK4_0;
														
 
															+// function to calculate the inner product between a q4_0 block and 32 floats (yl); sumy is SUM(yl[i])
														
 
															+float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
														
 
															+    float d = qb_curr->d;
														
 
															+    float4 acc = 0.f;
														
 
															+    device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
														
 
															+    for (int i = 0; i < 16; i+=2) {
														
 
															+        acc[0] += yl[i]      * (qs[i / 2] & 0x000F);
														
 
															+        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
														
 
															+        acc[2] += yl[i +  1] * (qs[i / 2] & 0x0F00);
														
 
															+        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
														
 
															+    }
														
 
															+    return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
														
 
															+}
														
 
															-    const int64_t r0 = tgpig.x;
														
 
															-    const int64_t r1 = tgpig.y;
														
 
															+// function to calculate the inner product between a q4_1 block and 32 floats (yl); sumy is SUM(yl[i])
														
 
															+float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
														
 
															+    float d = qb_curr->d;
														
 
															+    float m = qb_curr->m;
														
 
															+    float4 acc = 0.f;
														
 
															+    device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
														
 
															+    for (int i = 0; i < 16; i+=2) {
														
 
															+        acc[0] += yl[i]      * (qs[i / 2] & 0x000F);
														
 
															+        acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
														
 
															+        acc[2] += yl[i +  1] * (qs[i / 2] & 0x0F00);
														
 
															+        acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
														
 
															+    }
														
 
															+    return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
														
 
															+}
														
 
															-    device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
														
 
															+// putting them in the kernel causes a significant performance penalty
														
 
															+#define N_DST 4 // each SIMD group works on 4 rows
														
 
															+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
														
 
															+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
														
 
															+template<typename block_q_type>
														
 
															+void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
														
 
															+                    int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
														
 
															+                    uint2 tgpig, uint tiisg, uint sgitg) {
														
 
															+    const int nb = ne00/QK4_0;
														
 
															+    const int r0 = tgpig.x;
														
 
															+    const int r1 = tgpig.y;
														
 
															+    device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
														
 
															     device const float      * y = (device const float      *) src1 + r1*ne10;
														
 
															-
														
 
															-    const int nth = tptg.x*tptg.y;
														
 
															-    const int ith = tptg.y*tpitg.x + tpitg.y;
														
 
															-
														
 
															-    const int ix = tpitg.y/4;           // 0 or 1
														
 
															-    const int iy = tpitg.y - 4*ix;      // 0...3
														
 
															-
														
 
															-    const int first = 4 * iy;
														
 
															-
														
 
															-    float sumf = 0;
														
 
															-
														
 
															-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
														
 
															-
														
 
															-        const float d = (float)x[i].d;
														
 
															-
														
 
															-        device const uint8_t * xl = x[i].qs + first;
														
 
															-        device const float   * yl = y + i * QK4_0 + first;
														
 
															-
														
 
															-        float2 acc = {0.0f, 0.0f};
														
 
															-
														
 
															-        for (int j = 0; j < 4; ++j) {
														
 
															-
														
 
															-            acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
														
 
															-            acc[1] += yl[j] + yl[j+16];
														
 
															-
														
 
															+    float4 y_curr[8];       // src1 vector cache
														
 
															+    float sumf[N_DST]={0.f}, all_sum;
														
 
															+    thread float * yl=(thread float *)y_curr;
														
 
															+
														
 
															+    // each thread in a SIMD group deals with 1 block.
														
 
															+    for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
														
 
															+        float sumy = 0;
														
 
															+        for (int i = 0; i < QK4_0 / 4; i++) {
														
 
															+            y_curr[i] = *((device float4  *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
														
 
															+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
														
 
															         }
														
 
															-        sumf += d * (acc[0] - 8.f*acc[1]);
														
 
															+        for (int row = 0; row < N_DST; row++) {
														
 
															+            sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
														
 
															+        }
														
 
															     }
														
 
															-    sum[ith] = sumf;
														
 
															+    // from here on, load two rows at a time and 16 blocks per row
														
 
															+    int ir = tiisg / (N_SIMDWIDTH / 2);
														
 
															+    int ib = tiisg % (N_SIMDWIDTH / 2);
														
 
															+    for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
														
 
															+        int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
														
 
															+        float sumy = 0;
														
 
															+        for (int i = 0; i < QK4_0 / 4; i++) {
														
 
															+            y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
														
 
															+            sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
														
 
															+        }
														
 
															-    //
														
 
															-    // Accumulate the sum from all threads in the threadgroup
														
 
															-    //
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith%4 == 0) {
														
 
															-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
														
 
															-    }
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith%16 == 0) {
														
 
															-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
														
 
															+        for (int row = 0; row < N_DST; row+=2) {
														
 
															+            if (nb_start + ib < nb) {
														
 
															+                sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
														
 
															+            }
														
 
															+        }
														
 
															     }
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith == 0) {
														
 
															-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
														
 
															-        dst[r1*ne0 + r0] = sum[0];
														
 
															+
														
 
															+    for (int row = 0; row < N_DST; ++row) {
														
 
															+        all_sum = simd_sum(sumf[row]);
														
 
															+        if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
														
 
															+            dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
														
 
															+        }
														
 
															     }
														
 
															 }
														
 
															-kernel void kernel_mul_mat_q4_1_f32(
														
 
															+kernel void kernel_mul_mat_q4_0_f32(
														
 
															         device const  void * src0,
														
 
															         device const float * src1,
														
 
															         device       float * dst,
														
 
															         constant   int64_t & ne00,
														
 
															         constant   int64_t & ne10,
														
 
															         constant   int64_t & ne0,
														
 
															-        threadgroup float  * sum [[threadgroup(0)]],
														
 
															+        constant   int64_t & ne01[[buffer(4)]],
														
 
															         uint2 tgpig[[threadgroup_position_in_grid]],
														
 
															-        uint2 tpitg[[thread_position_in_threadgroup]],
														
 
															-        uint2  tptg[[threads_per_threadgroup]]) {
														
 
															-    const int nb = ne00/QK4_1;
														
 
															-
														
 
															-    const int64_t r0 = tgpig.x;
														
 
															-    const int64_t r1 = tgpig.y;
														
 
															-
														
 
															-    device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
														
 
															-    device const float      * y = (device const float      *) src1 + r1*ne10;
														
 
															-
														
 
															-    const uint nth = tptg.x*tptg.y;
														
 
															-    const uint ith = tptg.y*tpitg.x + tpitg.y;
														
 
															-
														
 
															-    const int ix = tpitg.y/4;           // 0 or 1
														
 
															-    const int iy = tpitg.y - 4*ix;      // 0...3
														
 
															-
														
 
															-    const int first = 4 * iy;
														
 
															-
														
 
															-    float sumf = 0;
														
 
															-
														
 
															-    for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
														
 
															-
														
 
															-        const float d = (float)x[i].d;
														
 
															-        const float m = (float)x[i].m;
														
 
															-
														
 
															-        device const uint8_t * xl = x[i].qs + first;
														
 
															-        device const float   * yl = y + i * QK4_1 + first;
														
 
															-
														
 
															-        float2 acc = {0.0f, 0.0f};
														
 
															-
														
 
															-        for (int j = 0; j < 4; ++j) {
														
 
															-
														
 
															-            acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
														
 
															-            acc[1] += yl[j+16] * (d * (xl[j] >>  4) + m);
														
 
															-
														
 
															-        }
														
 
															-
														
 
															-        sumf += acc[0] + acc[1];
														
 
															-    }
														
 
															-
														
 
															-    sum[ith] = sumf;
														
 
															+        uint tiisg[[thread_index_in_simdgroup]],
														
 
															+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
														
 
															+    mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
														
 
															+}
														
 
															-    //
														
 
															-    // Accumulate the sum from all threads in the threadgroup
														
 
															-    //
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith%4 == 0) {
														
 
															-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
														
 
															-    }
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith%16 == 0) {
														
 
															-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
														
 
															-    }
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith == 0) {
														
 
															-        for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
														
 
															-        dst[r1*ne0 + r0] = sum[0];
														
 
															-    }
														
 
															+kernel void kernel_mul_mat_q4_1_f32(
														
 
															+        device const  void * src0,
														
 
															+        device const float * src1,
														
 
															+        device       float * dst,
														
 
															+        constant   int64_t & ne00,
														
 
															+        constant   int64_t & ne10,
														
 
															+        constant   int64_t & ne0,
														
 
															+        constant   int64_t & ne01[[buffer(4)]],
														
 
															+        uint2 tgpig[[threadgroup_position_in_grid]],
														
 
															+        uint tiisg[[thread_index_in_simdgroup]],
														
 
															+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
														
 
															+     mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
														
 
															 }
														
 
															 kernel void kernel_mul_mat_f16_f32(
														
@@ -641,17 +628,19 @@ kernel void kernel_rope(
 
															         constant       int & n_past,
														
 
															         constant       int & n_dims,
														
 
															         constant       int & mode,
														
 
															+        constant     float & freq_base,
														
 
															+        constant     float & freq_scale,
														
 
															         uint3 tpig[[thread_position_in_grid]]) {
														
 
															     const int64_t i3 = tpig[2];
														
 
															     const int64_t i2 = tpig[1];
														
 
															     const int64_t i1 = tpig[0];
														
 
															     const bool is_neox = mode & 2;
														
 
															-    const float theta_scale = pow(10000.0, -2.0f/n_dims);
														
 
															+    const float theta_scale = pow(freq_base, -2.0f/n_dims);
														
 
															     const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
														
 
															-    float theta = (float)p;
														
 
															+    float theta = freq_scale * (float)p;
														
 
															     if (!is_neox) {
														
 
															         for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
														
@@ -1489,6 +1478,7 @@ kernel void kernel_mul_mat_q3_K_f32(
 
															 }
														
 
															+#if QK_K == 256
														
 
															 kernel void kernel_mul_mat_q4_K_f32(
														
 
															         device const  void * src0,
														
 
															         device const float * src1,
														
@@ -1496,131 +1486,180 @@ kernel void kernel_mul_mat_q4_K_f32(
 
															         constant   int64_t & ne00,
														
 
															         constant   int64_t & ne10,
														
 
															         constant   int64_t & ne0,
														
 
															-        threadgroup float  * sum [[threadgroup(0)]],
														
 
															+        constant   int64_t & ne01[[buffer(4)]],
														
 
															         uint2 tgpig[[threadgroup_position_in_grid]],
														
 
															-        uint2 tpitg[[thread_position_in_threadgroup]],
														
 
															-        uint2  tptg[[threads_per_threadgroup]]) {
														
 
															-
														
 
															-    const int nb = ne00/QK_K;
														
 
															-
														
 
															-    const int64_t r0 = tgpig.x;
														
 
															-    const int64_t r1 = tgpig.y;
														
 
															-
														
 
															-    const int nth = tptg.x*tptg.y;
														
 
															-    const int ith = tptg.y*tpitg.x + tpitg.y;
														
 
															-
														
 
															-    device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
														
 
															-    device const float     * yy = (device const float      *) src1 + r1*ne10;
														
 
															-
														
 
															-    float sumf = 0;
														
 
															-
														
 
															-#if QK_K == 256
														
 
															+        uint tiisg[[thread_index_in_simdgroup]],
														
 
															+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
														
 
															     const uint16_t kmask1 = 0x3f3f;
														
 
															     const uint16_t kmask2 = 0x0f0f;
														
 
															     const uint16_t kmask3 = 0xc0c0;
														
 
															-    const int tid = tpitg.y;   // 0...16
														
 
															-    const int il  = tid/4;     // 0...3
														
 
															-    const int ir  = tid - 4*il;// 0...3
														
 
															-    const int n   = 4;
														
 
															+    const int ix = tiisg/8;  // 0...3
														
 
															+    const int it = tiisg%8;  // 0...7
														
 
															+    const int im = it/4;     // 0 or 1
														
 
															+    const int ir = it%4;     // 0...3
														
 
															-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
														
 
															-    const int in = il%2;
														
 
															+    const int nb = ne00/QK_K;
														
 
															+    const int r0 = tgpig.x;
														
 
															+    const int r1 = tgpig.y;
														
 
															+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
														
 
															+    const int ib_row = first_row * nb;
														
 
															+    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
														
 
															+    device const float      * y = (device const float      *) src1 + r1*ne10;
														
 
															+    float yl[16];
														
 
															+    float yh[16];
														
 
															+    float sumf[N_DST]={0.f}, all_sum;
														
 
															-    const int l0 = n*(2*ir + in);
														
 
															-    const int q_offset = 32*im + l0;
														
 
															-    const int y_offset = 64*im + l0;
														
 
															+    const int step = sizeof(block_q4_K) * nb / 2;
														
 
															-    uchar2 sc1, sc2, sc3, sc4;
														
 
															+    device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir;
														
 
															-    for (int i = tpitg.x; i < nb; i += tptg.x) {
														
 
															+    uint16_t sc16[4];
														
 
															+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
														
 
															-        device const uint8_t * q1 = (x + i)->qs + q_offset;
														
 
															-        device const uint8_t * q2 = q1 + 64;
														
 
															-        device const float   * y1 = yy + i*QK_K + y_offset;
														
 
															-        device const float   * y2 = y1 + 128;
														
 
															+    for (int ib = ix; ib < nb; ib += 4) {
														
 
															-        const float dall = (float)((x + i)->d);
														
 
															-        const float dmin = (float)((x + i)->dmin);
														
 
															+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
														
 
															+        for (int i = 0; i < 8; ++i) {
														
 
															+            yl[i+0] = y4[i+  0]; sumy[0] += yl[i+0];
														
 
															+            yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8];
														
 
															+            yh[i+0] = y4[i+128]; sumy[2] += yh[i+0];
														
 
															+            yh[i+8] = y4[i+160]; sumy[3] += yh[i+8];
														
 
															+        }
														
 
															-        device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
														
 
															-        sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
														
 
															-        sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
														
 
															-        sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
														
 
															-        sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
														
 
															+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im;
														
 
															+        device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir;
														
 
															+        device const half     * dh = &x[ib].d;
														
 
															+
														
 
															+        for (int row = 0; row < N_DST; row++) {
														
 
															+
														
 
															+            sc16[0] = sc[0] & kmask1;
														
 
															+            sc16[1] = sc[2] & kmask1;
														
 
															+            sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
														
 
															+            sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
														
 
															+
														
 
															+            device const uint16_t * q2 = q1 + 32;
														
 
															+
														
 
															+            float4 acc1 = {0.f, 0.f, 0.f, 0.f};
														
 
															+            float4 acc2 = {0.f, 0.f, 0.f, 0.f};
														
 
															+            for (int i = 0; i < 8; i += 2) {
														
 
															+                acc1[0] += yl[i+0] * (q1[i/2] & 0x000F);
														
 
															+                acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00);
														
 
															+                acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0);
														
 
															+                acc1[3] += yl[i+9] * (q1[i/2] & 0xF000);
														
 
															+                acc2[0] += yh[i+0] * (q2[i/2] & 0x000F);
														
 
															+                acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00);
														
 
															+                acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0);
														
 
															+                acc2[3] += yh[i+9] * (q2[i/2] & 0xF000);
														
 
															+            }
														
 
															-        float4 s = {0.f, 0.f, 0.f, 0.f};
														
 
															-        float smin = 0;
														
 
															-        for (int l = 0; l < n; ++l) {
														
 
															+            float dall = dh[0];
														
 
															+            float dmin = dh[1];
														
 
															+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] +
														
 
															+                                 (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f +
														
 
															+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] +
														
 
															+                                 (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
														
 
															+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
														
 
															+
														
 
															+            q1 += step;
														
 
															+            sc += step;
														
 
															+            dh += step;
														
 
															+        }
														
 
															-            s[0] += y1[l] * (q1[l] & 0xF); s[1] += y1[l+32] * (q1[l] >> 4);
														
 
															-            s[2] += y2[l] * (q2[l] & 0xF); s[3] += y2[l+32] * (q2[l] >> 4);
														
 
															-            smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
														
 
															+        y4 += 4 * QK_K;
														
 
															+    }
														
 
															+    for (int row = 0; row < N_DST; ++row) {
														
 
															+        all_sum = simd_sum(sumf[row]);
														
 
															+        if (tiisg == 0) {
														
 
															+            dst[r1*ne0 + first_row + row] = all_sum;
														
 
															         }
														
 
															-        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
														
 
															-
														
 
															     }
														
 
															+}
														
 
															 #else
														
 
															-    uint16_t aux16[2];
														
 
															-    thread const uint8_t * scales = (thread const uint8_t *)aux16;
														
 
															+kernel void kernel_mul_mat_q4_K_f32(
														
 
															+        device const  void * src0,
														
 
															+        device const float * src1,
														
 
															+        device       float * dst,
														
 
															+        constant   int64_t & ne00,
														
 
															+        constant   int64_t & ne10,
														
 
															+        constant   int64_t & ne0,
														
 
															+        constant   int64_t & ne01[[buffer(4)]],
														
 
															+        uint2 tgpig[[threadgroup_position_in_grid]],
														
 
															+        uint tiisg[[thread_index_in_simdgroup]],
														
 
															+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
														
 
															-    const int il  = 4*tpitg.x;
														
 
															+    const int ix = tiisg/4;  // 0...7
														
 
															+    const int it = tiisg%4;  // 0...3
														
 
															-    for (int i = tpitg.y; i < nb; i += tptg.y) {
														
 
															+    const int nb = ne00/QK_K;
														
 
															+    const int r0 = tgpig.x;
														
 
															+    const int r1 = tgpig.y;
														
 
															+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
														
 
															+    const int ib_row = first_row * nb;
														
 
															+    device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
														
 
															+    device const float      * y = (device const float      *) src1 + r1*ne10;
														
 
															+    float yl[8];
														
 
															+    float yh[8];
														
 
															+    float sumf[N_DST]={0.f}, all_sum;
														
 
															-        device const uint8_t * q = x[i].qs + il;
														
 
															-        device const float   * y = yy + i * QK_K + il;
														
 
															+    const int step = sizeof(block_q4_K) * nb / 2;
														
 
															-        const float d = (float)x[i].d[0];
														
 
															-        const float m = (float)x[i].d[1];
														
 
															+    device const float * y4 = y + ix * QK_K + 8 * it;
														
 
															-        device const uint16_t * a = (device const uint16_t *)x[i].scales;
														
 
															-        aux16[0] = a[0] & 0x0f0f;
														
 
															-        aux16[1] = (a[0] >> 4) & 0x0f0f;
														
 
															+    uint16_t sc16[4];
														
 
															-        for (int l = 0; l < 4; ++l) {
														
 
															-            sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
														
 
															-                  + d * scales[1] * (y[l+32] * (q[l] >>  4) + y[l+48] * (q[l+16] >>  4)) - m * scales[3] * (y[l+32] + y[l+48]);
														
 
															+    for (int ib = ix; ib < nb; ib += 8) {
														
 
															+
														
 
															+        float2 sumy = {0.f, 0.f};
														
 
															+        for (int i = 0; i < 8; ++i) {
														
 
															+            yl[i] = y4[i+ 0]; sumy[0] += yl[i];
														
 
															+            yh[i] = y4[i+32]; sumy[1] += yh[i];
														
 
															         }
														
 
															-    }
														
 
															-#endif
														
 
															-    sum[ith] = sumf;
														
 
															+        device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
														
 
															+        device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
														
 
															+        device const half     * dh = x[ib].d;
														
 
															-    //
														
 
															-    // Accumulate the sum from all threads in the threadgroup
														
 
															-    // This version is slightly faster than the commented out one below,
														
 
															-    // which I copy-pasted from ggerganov's q4_0 dot product for metal.
														
 
															-    //
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith%4 == 0) {
														
 
															-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
														
 
															-    }
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith%16 == 0) {
														
 
															-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
														
 
															-    }
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith == 0) {
														
 
															-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
														
 
															-        dst[r1*ne0 + r0] = sum[0];
														
 
															-    }
														
 
															+        for (int row = 0; row < N_DST; row++) {
														
 
															+
														
 
															+            sc16[0] = sc[0] & 0x000f;
														
 
															+            sc16[1] = sc[0] & 0x0f00;
														
 
															+            sc16[2] = sc[0] & 0x00f0;
														
 
															+            sc16[3] = sc[0] & 0xf000;
														
 
															+
														
 
															+            float2 acc1 = {0.f, 0.f};
														
 
															+            float2 acc2 = {0.f, 0.f};
														
 
															+            for (int i = 0; i < 8; i += 2) {
														
 
															+                acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
														
 
															+                acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
														
 
															+                acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
														
 
															+                acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
														
 
															+            }
														
 
															+
														
 
															+            float dall = dh[0];
														
 
															+            float dmin = dh[1];
														
 
															+            sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
														
 
															+                                 (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
														
 
															+                         dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
														
 
															+
														
 
															+            qs += step;
														
 
															+            sc += step;
														
 
															+            dh += step;
														
 
															+        }
														
 
															-    //// accumulate the sum from all threads in the threadgroup
														
 
															-    //threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    //for (uint i = nth/2; i > 0; i /= 2) {
														
 
															-    //    if (ith < i) {
														
 
															-    //        sum[ith] += sum[ith + i];
														
 
															-    //    }
														
 
															-    //    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    //}
														
 
															+        y4 += 8 * QK_K;
														
 
															+    }
														
 
															-    //if (ith == 0) {
														
 
															-    //    dst[r1*ne0 + r0] = sum[0];
														
 
															-    //}
														
 
															+    for (int row = 0; row < N_DST; ++row) {
														
 
															+        all_sum = simd_sum(sumf[row]);
														
 
															+        if (tiisg == 0) {
														
 
															+            dst[r1*ne0 + first_row + row] = all_sum;
														
 
															+        }
														
 
															+    }
														
 
															 }
														
 
															+#endif
														
 
															 kernel void kernel_mul_mat_q5_K_f32(
														
 
															         device const  void * src0,
														
@@ -1629,39 +1668,39 @@ kernel void kernel_mul_mat_q5_K_f32(
 
															         constant   int64_t & ne00,
														
 
															         constant   int64_t & ne10,
														
 
															         constant   int64_t & ne0,
														
 
															-        threadgroup float  * sum [[threadgroup(0)]],
														
 
															         uint2 tgpig[[threadgroup_position_in_grid]],
														
 
															-        uint2 tpitg[[thread_position_in_threadgroup]],
														
 
															-        uint2  tptg[[threads_per_threadgroup]]) {
														
 
															+        uint tiisg[[thread_index_in_simdgroup]],
														
 
															+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
														
 
															     const int nb = ne00/QK_K;
														
 
															     const int64_t r0 = tgpig.x;
														
 
															     const int64_t r1 = tgpig.y;
														
 
															-    device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
														
 
															+    const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
														
 
															+
														
 
															+    device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb;
														
 
															     device const float     * yy = (device const float      *) src1 + r1*ne10;
														
 
															-    const int nth = tptg.x*tptg.y;
														
 
															-    const int ith = tptg.y*tpitg.x + tpitg.y;
														
 
															+    float sumf[2]={0.f};
														
 
															-    float sumf = 0;
														
 
															+    const int step = sizeof(block_q5_K) * nb;
														
 
															 #if QK_K == 256
														
 
															+#
														
 
															+    float yl[16], yh[16];
														
 
															     const uint16_t kmask1 = 0x3f3f;
														
 
															     const uint16_t kmask2 = 0x0f0f;
														
 
															     const uint16_t kmask3 = 0xc0c0;
														
 
															-    const int tid = tpitg.y;   // 0...16
														
 
															-    const int il  = tid/4;     // 0...3
														
 
															-    const int ir  = tid - 4*il;// 0...3
														
 
															-    const int n   = 4;
														
 
															-
														
 
															-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
														
 
															-    const int in = il%2;
														
 
															+    const int tid = tiisg/4;
														
 
															+    const int ix  = tiisg%4;
														
 
															+    const int im  = tid/4;
														
 
															+    const int ir  = tid%4;
														
 
															+    const int n   = 8;
														
 
															-    const int l0 = n*(2*ir + in);
														
 
															+    const int l0 = n*ir;
														
 
															     const int q_offset = 32*im + l0;
														
 
															     const int y_offset = 64*im + l0;
														
@@ -1670,78 +1709,114 @@ kernel void kernel_mul_mat_q5_K_f32(
 
															     const uint8_t hm3 = hm1 << 4;
														
 
															     const uint8_t hm4 = hm2 << 4;
														
 
															-    uchar2 sc1, sc2, sc3, sc4;
														
 
															+    uint16_t sc16[4];
														
 
															+    thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
														
 
															-    for (int i = tpitg.x; i < nb; i += tptg.x) {
														
 
															+    device const float * y1 = yy + ix*QK_K + y_offset;
														
 
															+
														
 
															+    for (int i = ix; i < nb; i += 4) {
														
 
															+
														
 
															+        device const uint8_t * q1 = x[i].qs + q_offset;
														
 
															+        device const uint8_t * qh = x[i].qh + l0;
														
 
															+        device const half * dh = &x[i].d;
														
 
															+        device const uint16_t * a = (device const uint16_t *)x[i].scales + im;
														
 
															-        device const uint8_t * q1 = (x + i)->qs + q_offset;
														
 
															-        device const uint8_t * q2 = q1 + 64;
														
 
															-        device const uint8_t * qh = (x + i)->qh + l0;
														
 
															-        device const float   * y1 = yy + i*QK_K + y_offset;
														
 
															-        device const float   * y2 = y1 + 128;
														
 
															+        device const float * y2 = y1 + 128;
														
 
															+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
														
 
															+        for (int l = 0; l < 8; ++l) {
														
 
															+            yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0];
														
 
															+            yl[l+8] = y1[l+32]; sumy[1] += yl[l+8];
														
 
															+            yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0];
														
 
															+            yh[l+8] = y2[l+32]; sumy[3] += yh[l+8];
														
 
															+        }
														
 
															-        const float dall = (float)((x + i)->d);
														
 
															-        const float dmin = (float)((x + i)->dmin);
														
 
															+        for (int row = 0; row < 2; ++row) {
														
 
															-        device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
														
 
															-        sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
														
 
															-        sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
														
 
															-        sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
														
 
															-        sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
														
 
															+            device const uint8_t * q2 = q1 + 64;
														
 
															-        float4 s = {0.f, 0.f, 0.f, 0.f};
														
 
															-        float smin = 0;
														
 
															-        for (int l = 0; l < n; ++l) {
														
 
															+            sc16[0] = a[0] & kmask1;
														
 
															+            sc16[1] = a[2] & kmask1;
														
 
															+            sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
														
 
															+            sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
														
 
															-            s[0] += y1[l+ 0] * ((q1[l] & 0xF) + (qh[l] & hm1 ? 16 : 0));
														
 
															-            s[1] += y1[l+32] * ((q1[l] >>  4) + (qh[l] & hm2 ? 16 : 0));
														
 
															-            s[2] += y2[l+ 0] * ((q2[l] & 0xF) + (qh[l] & hm3 ? 16 : 0));
														
 
															-            s[3] += y2[l+32] * ((q2[l] >>  4) + (qh[l] & hm4 ? 16 : 0));
														
 
															-            smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
														
 
															+            float4 acc = {0.f, 0.f, 0.f, 0.f};
														
 
															+            for (int l = 0; l < n; ++l) {
														
 
															+                uint8_t h = qh[l];
														
 
															+                acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0));
														
 
															+                acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0));
														
 
															+                acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0));
														
 
															+                acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0));
														
 
															+            }
														
 
															+            const float dall = dh[0];
														
 
															+            const float dmin = dh[1];
														
 
															+            sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) -
														
 
															+                         dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
														
 
															+
														
 
															+            q1 += step;
														
 
															+            qh += step;
														
 
															+            dh += step/2;
														
 
															+            a  += step/2;
														
 
															         }
														
 
															-        sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
														
 
															+
														
 
															+        y1 += 4 * QK_K;
														
 
															     }
														
 
															 #else
														
 
															-    const int il  = 4 * tpitg.x;  // 0, 4, 8, 12
														
 
															-    const int im  = il/8;         // 0, 0, 1, 1
														
 
															-    const int in  = il%8;         // 0, 4, 0, 4
														
 
															+    float yl[8], yh[8];
														
 
															-    for (int i = tpitg.y; i < nb; i += tptg.y) {
														
 
															+    const int il = 4 * (tiisg/8);  // 0, 4, 8, 12
														
 
															+    const int ix = tiisg%8;
														
 
															+    const int im = il/8;         // 0, 0, 1, 1
														
 
															+    const int in = il%8;         // 0, 4, 0, 4
														
 
															-        const float d = (float)x[i].d;
														
 
															+    device const float * y = yy + ix*QK_K + il;
														
 
															+
														
 
															+    for (int i = ix; i < nb; i += 8) {
														
 
															+
														
 
															+        float4 sumy = {0.f, 0.f, 0.f, 0.f};
														
 
															+        for (int l = 0; l < 4; ++l) {
														
 
															+            yl[l+0] = y[l+ 0];
														
 
															+            yl[l+4] = y[l+16];
														
 
															+            yh[l+0] = y[l+32];
														
 
															+            yh[l+4] = y[l+48];
														
 
															+        }
														
 
															+
														
 
															+        device const half * dh = &x[i].d;
														
 
															         device const uint8_t * q = x[i].qs + il;
														
 
															         device const uint8_t * h = x[i].qh + in;
														
 
															         device const int8_t  * s = x[i].scales;
														
 
															-        device const float   * y = yy + i*QK_K + il;
														
 
															-        for (int l = 0; l < 4; ++l) {
														
 
															-            const uint8_t hl = h[l] >> im;
														
 
															-            sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
														
 
															-                  + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
														
 
															-                  + y[l+32] * d * s[2] * ((q[l+ 0] >>  4) - (hl & 0x10 ? 0 : 16))
														
 
															-                  + y[l+48] * d * s[3] * ((q[l+16] >>  4) - (hl & 0x40 ? 0 : 16));
														
 
															+        for (int row = 0; row < 2; ++row) {
														
 
															+
														
 
															+            const float d = dh[0];
														
 
															+
														
 
															+            float2 acc = {0.f, 0.f};
														
 
															+            for (int l = 0; l < 4; ++l) {
														
 
															+                const uint8_t hl = h[l] >> im;
														
 
															+                acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
														
 
															+                        + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
														
 
															+                acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
														
 
															+                        + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
														
 
															+            }
														
 
															+            sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
														
 
															+
														
 
															+            q += step;
														
 
															+            h += step;
														
 
															+            s += step;
														
 
															+            dh += step/2;
														
 
															+
														
 
															         }
														
 
															+
														
 
															+        y += 8 * QK_K;
														
 
															     }
														
 
															 #endif
														
 
															-    sum[ith] = sumf;
														
 
															-    //
														
 
															-    // Accumulate the sum from all threads in the threadgroup
														
 
															-    //
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith%4 == 0) {
														
 
															-        sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
														
 
															-    }
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith%16 == 0) {
														
 
															-        sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
														
 
															-    }
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith == 0) {
														
 
															-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
														
 
															-        dst[r1*ne0 + r0] = sum[0];
														
 
															+    for (int row = 0; row < 2; ++row) {
														
 
															+        const float tot = simd_sum(sumf[row]);
														
 
															+        if (tiisg == 0) {
														
 
															+            dst[r1*ne0 + first_row + row] = tot;
														
 
															+        }
														
 
															     }
														
 
															 }
														
@@ -1753,10 +1828,9 @@ kernel void kernel_mul_mat_q6_K_f32(
 
															         constant   int64_t & ne00,
														
 
															         constant   int64_t & ne10,
														
 
															         constant   int64_t & ne0,
														
 
															-        threadgroup float  * sum [[threadgroup(0)]],
														
 
															         uint2 tgpig[[threadgroup_position_in_grid]],
														
 
															-        uint2 tpitg[[thread_position_in_threadgroup]],
														
 
															-        uint2  tptg[[threads_per_threadgroup]]) {
														
 
															+        uint tiisg[[thread_index_in_simdgroup]],
														
 
															+        uint sgitg[[simdgroup_index_in_threadgroup]]) {
														
 
															     const uint8_t kmask1 = 0x03;
														
 
															     const uint8_t kmask2 = 0x0C;
														
@@ -1768,19 +1842,18 @@ kernel void kernel_mul_mat_q6_K_f32(
 
															     const int64_t r0 = tgpig.x;
														
 
															     const int64_t r1 = tgpig.y;
														
 
															-    device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
														
 
															-    device const float     * yy = (device const float      *) src1 + r1*ne10;
														
 
															+    const int row = 2 * r0 + sgitg;
														
 
															-    const int nth = tptg.x*tptg.y;
														
 
															-    const int ith = tptg.y*tpitg.x + tpitg.y;
														
 
															+    device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb; //r0*nb;
														
 
															+    device const float     * yy = (device const float      *) src1 + r1*ne10;
														
 
															     float sumf = 0;
														
 
															 #if QK_K == 256
														
 
															-    // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
														
 
															-    const int iqs  = 16 * tpitg.y;
														
 
															-    const int ip   = iqs / 128;         // 0 or 1
														
 
															-    const int il   = (iqs - 128*ip)/16; // 0...7
														
 
															+    const int tid  = tiisg/2;
														
 
															+    const int ix   = tiisg%2;
														
 
															+    const int ip   = tid/8;         // 0 or 1
														
 
															+    const int il   = tid%8;
														
 
															     const int n    = 4;
														
 
															     const int l0   = n*il;
														
 
															     const int is   = 8*ip + l0/16;
														
@@ -1789,9 +1862,10 @@ kernel void kernel_mul_mat_q6_K_f32(
 
															     const int q_offset_l = 64*ip + l0;
														
 
															     const int q_offset_h = 32*ip + l0;
														
 
															-    for (int i = tpitg.x; i < nb; i += tptg.x) {
														
 
															+    for (int i = ix; i < nb; i += 2) {
														
 
															-        device const uint8_t * ql = x[i].ql + q_offset_l;
														
 
															+        device const uint8_t * q1 = x[i].ql + q_offset_l;
														
 
															+        device const uint8_t * q2 = q1 + 32;
														
 
															         device const uint8_t * qh = x[i].qh + q_offset_h;
														
 
															         device const int8_t  * sc = x[i].scales + is;
														
@@ -1801,19 +1875,21 @@ kernel void kernel_mul_mat_q6_K_f32(
 
															         float4 sums = {0.f, 0.f, 0.f, 0.f};
														
 
															         for (int l = 0; l < n; ++l) {
														
 
															-            sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
														
 
															-            sums[1] += y[l+32] * ((int8_t)((ql[l+32] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
														
 
															-            sums[2] += y[l+64] * ((int8_t)((ql[l+ 0]  >> 4) | ((qh[l] & kmask3) << 0)) - 32);
														
 
															-            sums[3] += y[l+96] * ((int8_t)((ql[l+32]  >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
														
 
															+            sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
														
 
															+            sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
														
 
															+            sums[2] += y[l+64] * ((int8_t)((q1[l]  >> 4) | ((qh[l] & kmask3) << 0)) - 32);
														
 
															+            sums[3] += y[l+96] * ((int8_t)((q2[l]  >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
														
 
															         }
														
 
															         sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
														
 
															     }
														
 
															+
														
 
															 #else
														
 
															-    const int il  = 4*tpitg.x;    // 0, 4, 8, 12
														
 
															+    const int ix  = tiisg/4;
														
 
															+    const int il  = 4*(tiisg%4);
														
 
															-    for (int i = tpitg.y; i < nb; i += tptg.y) {
														
 
															+    for (int i = ix; i < nb; i += 8) {
														
 
															         device const float * y = yy + i * QK_K + il;
														
 
															         device const uint8_t * ql = x[i].ql + il;
														
 
															         device const uint8_t * qh = x[i].qh + il;
														
@@ -1833,23 +1909,8 @@ kernel void kernel_mul_mat_q6_K_f32(
 
															 #endif
														
 
															-    sum[ith] = sumf;
														
 
															-
														
 
															-    //
														
 
															-    // Accumulate the sum from all threads in the threadgroup
														
 
															-    //
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith%4 == 0) {
														
 
															-        for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
														
 
															+    const float tot = simd_sum(sumf);
														
 
															+    if (tiisg == 0) {
														
 
															+        dst[r1*ne0 + row] = tot;
														
 
															     }
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith%16 == 0) {
														
 
															-        for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
														
 
															-    }
														
 
															-    threadgroup_barrier(mem_flags::mem_threadgroup);
														
 
															-    if (ith == 0) {
														
 
															-        for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
														
 
															-        dst[r1*ne0 + r0] = sum[0];
														
 
															-    }
														
 
															-
														
 
															 }
														
--- a/llama/ggml.c
+++ b/llama/ggml.c
--- a/llama/ggml.h
+++ b/llama/ggml.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
														
 
															+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -227,8 +227,13 @@
 
															 #define GGML_MAX_NAME          48
														
 
															 #define GGML_DEFAULT_N_THREADS 4
														
 
															+
														
 
															+#define GGML_EXIT_SUCCESS 0
														
 
															+#define GGML_EXIT_ABORTED 1
														
 
															+
														
 
															 #define GGML_UNUSED(x) (void)(x)
														
 
															+
														
 
															 #define GGML_ASSERT(x) \
														
 
															     do { \
														
 
															         if (!(x)) { \
														
@@ -389,6 +394,8 @@ extern "C" {
 
															         GGML_OP_CLAMP,
														
 
															         GGML_OP_CONV_1D,
														
 
															         GGML_OP_CONV_2D,
														
 
															+        GGML_OP_POOL_1D,
														
 
															+        GGML_OP_POOL_2D,
														
 
															         GGML_OP_FLASH_ATTN,
														
 
															         GGML_OP_FLASH_FF,
														
@@ -468,6 +475,10 @@ extern "C" {
 
															         // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
														
 
															         int n_tasks[GGML_MAX_NODES];
														
 
															+
														
 
															+        // abort ggml_graph_compute when true
														
 
															+        bool (*abort_callback)(void * data);
														
 
															+        void * abort_callback_data;
														
 
															     };
														
 
															     // computation graph
														
@@ -1136,6 +1147,17 @@ extern "C" {
 
															             int                   mode,
														
 
															             int                   n_ctx);
														
 
															+    // custom RoPE, in-place, returns view(a)
														
 
															+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
														
 
															+            struct ggml_context * ctx,
														
 
															+            struct ggml_tensor  * a,
														
 
															+            int                   n_past,
														
 
															+            int                   n_dims,
														
 
															+            int                   mode,
														
 
															+            float                 freq_base,
														
 
															+            float                 freq_scale,
														
 
															+            int                   n_ctx);
														
 
															+
														
 
															     // rotary position embedding backward, i.e compute dx from dy
														
 
															     // a - dy
														
 
															     GGML_API struct ggml_tensor * ggml_rope_back(
														
@@ -1190,6 +1212,31 @@ extern "C" {
 
															             int                   s,
														
 
															             int                   d);
														
 
															+    enum ggml_op_pool {
														
 
															+        GGML_OP_POOL_MAX,
														
 
															+        GGML_OP_POOL_AVG,
														
 
															+        GGML_OP_POOL_COUNT,
														
 
															+    };
														
 
															+
														
 
															+    GGML_API struct ggml_tensor* ggml_pool_1d(
														
 
															+            struct ggml_context * ctx,
														
 
															+            struct ggml_tensor  * a,
														
 
															+            enum ggml_op_pool     op,
														
 
															+            int                   k0, // kernel size
														
 
															+            int                   s0, // stride
														
 
															+            int                   p0); // padding
														
 
															+
														
 
															+    GGML_API struct ggml_tensor* ggml_pool_2d(
														
 
															+            struct ggml_context * ctx,
														
 
															+            struct ggml_tensor  * a,
														
 
															+            enum ggml_op_pool     op,
														
 
															+            int                   k0,
														
 
															+            int                   k1,
														
 
															+            int                   s0,
														
 
															+            int                   s1,
														
 
															+            int                   p0,
														
 
															+            int                   p1);
														
 
															+
														
 
															     GGML_API struct ggml_tensor * ggml_flash_attn(
														
 
															             struct ggml_context * ctx,
														
 
															             struct ggml_tensor  * q,
														
@@ -1329,7 +1376,7 @@ extern "C" {
 
															     // ggml_graph_plan() has to be called before ggml_graph_compute()
														
 
															     // when plan.work_size > 0, caller must allocate memory for plan.work_data
														
 
															     GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
														
 
															-    GGML_API              void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
														
 
															+    GGML_API               int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
														
 
															     GGML_API              void ggml_graph_reset  (struct ggml_cgraph * cgraph);
														
 
															     // same as ggml_graph_compute() but the work data is allocated as a part of the context
														
--- a/llama/k_quants.c
+++ b/llama/k_quants.c
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
														
 
															+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
--- a/llama/k_quants.h
+++ b/llama/k_quants.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
														
 
															+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -41,6 +41,14 @@
 
															 #define K_SCALE_SIZE 12
														
 
															 #endif
														
 
															+#ifndef static_assert
														
 
															+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
														
 
															+#define static_assert(cond, msg) _Static_assert(cond, msg)
														
 
															+#else
														
 
															+#define static_assert(cond, msg) struct global_scope_noop_trick
														
 
															+#endif
														
 
															+#endif
														
 
															+
														
 
															 //
														
 
															 // Super-block quantization structures
														
 
															 //
														
--- a/llama/llama-util.h
+++ b/llama/llama-util.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
														
 
															+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -201,13 +201,13 @@ struct llama_mmap {
 
															     llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
														
 
															         size = file->size;
														
 
															         int fd = fileno(file->fp);
														
 
															-        int flags = MAP_PRIVATE;
														
 
															+        int flags = MAP_SHARED;
														
 
															         // prefetch/readahead impairs performance on NUMA systems
														
 
															         if (numa) { prefetch = 0; }
														
 
															 #ifdef __linux__
														
 
															         if (prefetch) { flags |= MAP_POPULATE; }
														
 
															 #endif
														
 
															-        addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
														
 
															+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
														
 
															         if (addr == MAP_FAILED) {
														
 
															             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
														
 
															         }
														
@@ -249,7 +249,7 @@ struct llama_mmap {
 
															             throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
														
 
															         }
														
 
															-        addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
														
 
															+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
														
 
															         error = GetLastError();
														
 
															         CloseHandle(hMapping);
														
--- a/llama/llama.cpp
+++ b/llama/llama.cpp
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
														
 
															+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -127,14 +127,15 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 
															 // memory sizes
														
 
															 //
														
 
															-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
														
 
															+static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx) // returned by value: the table depends on n_ctx
														
 
															 {
														
 
															-    static std::map<e_model, size_t> k_sizes = {
														
 
															+    std::map<e_model, size_t> k_sizes = { // deliberately not static: a static local is initialized once and would keep the first n_ctx forever
														
 
															-        { MODEL_3B,    256ull * MB },
														
 
															-        { MODEL_7B,    512ull * MB },
														
 
															-        { MODEL_13B,   512ull * MB },
														
 
															-        { MODEL_30B,   512ull * MB },
														
 
															-        { MODEL_65B,  1024ull * MB },
														
 
															+        /* per-model size = (n_ctx / divisor + base) * MB; empirical scaling, still a guess */
														
 
															+        { MODEL_3B,   ((size_t) n_ctx / 16ull + 128ull) * MB },
														
 
															+        { MODEL_7B,   ((size_t) n_ctx / 16ull + 256ull) * MB },
														
 
															+        { MODEL_13B,  ((size_t) n_ctx / 12ull + 256ull) * MB },
														
 
															+        { MODEL_30B,  ((size_t) n_ctx / 10ull + 256ull) * MB },
														
 
															+        { MODEL_65B,  ((size_t) n_ctx /  8ull + 512ull) * MB },
														
 
															     };
														
 
															     return k_sizes;
														
 
															 }
														
@@ -166,14 +167,14 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 
															 // this is mostly needed for temporary mul_mat buffers to dequantize the data
														
 
															 // not actually needed if BLAS is disabled
														
 
															-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
														
 
															+static std::map<e_model, size_t> MEM_REQ_EVAL(int n_ctx) // returned by value: the table depends on n_ctx
														
 
															 {
														
 
															-    static std::map<e_model, size_t> k_sizes = {
														
 
															+    std::map<e_model, size_t> k_sizes = { // deliberately not static: a static local is initialized once and would keep the first n_ctx forever
														
 
															-        { MODEL_3B,   512ull * MB },
														
 
															-        { MODEL_7B,   768ull * MB },
														
 
															-        { MODEL_13B, 1024ull * MB },
														
 
															-        { MODEL_30B, 1280ull * MB },
														
 
															-        { MODEL_65B, 1536ull * MB },
														
 
															+        { MODEL_3B,  ((size_t) n_ctx / 256ull +  512ull) * MB }, // each entry: (n_ctx / 256 + per-model base) * MB
														
 
															+        { MODEL_7B,  ((size_t) n_ctx / 256ull +  768ull) * MB },
														
 
															+        { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB },
														
 
															+        { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB },
														
 
															+        { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB },
														
 
															     };
														
 
															     return k_sizes;
														
 
															 }
														
@@ -215,6 +216,10 @@ struct llama_hparams {
 
															     uint32_t n_head  = 32;
														
 
															     uint32_t n_layer = 32;
														
 
															     uint32_t n_rot   = 64;
														
 
															+
														
 
															+    float rope_freq_base  = 10000.0f;
														
 
															+    float rope_freq_scale = 1.0f;
														
 
															+
														
 
															     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
														
 
															     bool operator!=(const llama_hparams & other) const {
														
@@ -329,7 +334,7 @@ struct llama_model {
 
															 };
														
 
															 struct llama_context {
														
 
															-    llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
														
 
															+    llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
														
 
															 #ifdef GGML_USE_METAL
														
 
															     ~llama_context() {
														
 
															         if (ctx_metal) {
														
@@ -350,7 +355,6 @@ struct llama_context {
 
															     int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
														
 
															     const llama_model & model;
														
 
															-    const llama_vocab & vocab;
														
 
															     bool model_owner = false;
														
@@ -577,7 +581,9 @@ struct llama_file_loader {
 
															             }
														
 
															             // skip to the next multiple of 32 bytes
														
 
															-            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
														
 
															+            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
														
 
															+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
														
 
															+            }
														
 
															             tensor.file_off = file.tell();
														
 
															             tensor.name = name;
														
@@ -674,7 +680,7 @@ struct llama_model_loader {
 
															         *ctx_size_p = *mmapped_size_p = 0;
														
 
															         for (const llama_load_tensor & lt : tensors_map.tensors) {
														
 
															             *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
														
 
															-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
														
 
															+            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size + 16;
														
 
															         }
														
 
															     }
														
@@ -870,6 +876,8 @@ struct llama_context_params llama_context_default_params() {
 
															         /*.gpu_layers                  =*/ 0,
														
 
															         /*.main_gpu                    =*/ 0,
														
 
															         /*.tensor_split                =*/ {0},
														
 
															+        /*.rope_freq_base              =*/ 10000.0f,
														
 
															+        /*.rope_freq_scale             =*/ 1.0f,
														
 
															         /*.progress_callback           =*/ nullptr,
														
 
															         /*.progress_callback_user_data =*/ nullptr,
														
 
															         /*.low_vram                    =*/ false,
														
@@ -895,6 +903,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 
															     return result;
														
 
															 }
														
 
															+int llama_max_devices() {
														
 
															+    return LLAMA_MAX_DEVICES;
														
 
															+}
														
 
															+
														
 
															 bool llama_mmap_supported() {
														
 
															     return llama_mmap::SUPPORTED;
														
 
															 }
														
@@ -993,6 +1005,8 @@ static void llama_model_load_internal(
 
															         int n_gpu_layers,
														
 
															         int main_gpu,
														
 
															         const float * tensor_split,
														
 
															+        float rope_freq_base,
														
 
															+        float rope_freq_scale,
														
 
															         bool low_vram,
														
 
															         ggml_type memory_type,
														
 
															         bool use_mmap,
														
@@ -1027,22 +1041,27 @@ static void llama_model_load_internal(
 
															         }
														
 
															         hparams.n_ctx = n_ctx;
														
 
															+
														
 
															+        hparams.rope_freq_base  = rope_freq_base;
														
 
															+        hparams.rope_freq_scale = rope_freq_scale;
														
 
															     }
														
 
															     const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
														
 
															     {
														
 
															-        fprintf(stderr, "%s: format     = %s\n",  __func__, llama_file_version_name(file_version));
														
 
															-        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
														
 
															-        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
														
 
															-        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
														
 
															-        fprintf(stderr, "%s: n_mult     = %u\n",  __func__, hparams.n_mult);
														
 
															-        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
														
 
															-        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
														
 
															-        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
														
 
															+        fprintf(stderr, "%s: format     = %s\n",   __func__, llama_file_version_name(file_version));
														
 
															+        fprintf(stderr, "%s: n_vocab    = %u\n",   __func__, hparams.n_vocab);
														
 
															+        fprintf(stderr, "%s: n_ctx      = %u\n",   __func__, hparams.n_ctx);
														
 
															+        fprintf(stderr, "%s: n_embd     = %u\n",   __func__, hparams.n_embd);
														
 
															+        fprintf(stderr, "%s: n_mult     = %u\n",   __func__, hparams.n_mult);
														
 
															+        fprintf(stderr, "%s: n_head     = %u\n",   __func__, hparams.n_head);
														
 
															+        fprintf(stderr, "%s: n_layer    = %u\n",   __func__, hparams.n_layer);
														
 
															+        fprintf(stderr, "%s: n_rot      = %u\n",   __func__, hparams.n_rot);
														
 
															+        fprintf(stderr, "%s: freq_base  = %.1f\n", __func__, hparams.rope_freq_base);
														
 
															+        fprintf(stderr, "%s: freq_scale = %g\n",   __func__, hparams.rope_freq_scale);
														
 
															         fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
														
 
															-        fprintf(stderr, "%s: n_ff       = %u\n",  __func__, n_ff);
														
 
															-        fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
														
 
															+        fprintf(stderr, "%s: n_ff       = %u\n",   __func__, n_ff);
														
 
															+        fprintf(stderr, "%s: model size = %s\n",   __func__, llama_model_type_name(model.type));
														
 
															     }
														
 
															     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
														
@@ -1191,9 +1210,9 @@ static void llama_model_load_internal(
 
															         const size_t mem_required =
														
 
															             ctx_size +
														
 
															             mmapped_size - vram_weights + // weights in VRAM not in memory
														
 
															-            MEM_REQ_SCRATCH0().at(model.type) +
														
 
															+            MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
														
 
															             MEM_REQ_SCRATCH1().at(model.type) +
														
 
															-            MEM_REQ_EVAL().at    (model.type);
														
 
															+            MEM_REQ_EVAL(hparams.n_ctx).at(model.type);
														
 
															         // this is the memory required by one llama_state
														
 
															         const size_t mem_required_state =
														
@@ -1297,6 +1316,8 @@ static bool llama_model_load(
 
															         int n_gpu_layers,
														
 
															         int main_gpu,
														
 
															         float * tensor_split,
														
 
															+        float rope_freq_base,
														
 
															+        float rope_freq_scale,
														
 
															         bool low_vram,
														
 
															         ggml_type memory_type,
														
 
															         bool use_mmap,
														
@@ -1305,7 +1326,7 @@ static bool llama_model_load(
 
															         llama_progress_callback progress_callback,
														
 
															         void *progress_callback_user_data) {
														
 
															     try {
														
 
															-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
														
 
															+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
														
 
															                                   use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
														
 
															         return true;
														
 
															     } catch (const std::exception & err) {
														
@@ -1357,6 +1378,9 @@ static bool llama_eval_internal(
 
															     const int n_rot        = hparams.n_embd/hparams.n_head;
														
 
															     const int n_gpu_layers = model.n_gpu_layers;
														
 
															+    const float freq_base  = hparams.rope_freq_base;
														
 
															+    const float freq_scale = hparams.rope_freq_scale;
														
 
															+
														
 
															     auto & mem_per_token = lctx.mem_per_token;
														
 
															     auto & buf_compute   = lctx.buf_compute;
														
@@ -1454,11 +1478,11 @@ static bool llama_eval_internal(
 
															             offload_func_kq(tmpq);
														
 
															             ggml_set_name(tmpq, "tmpq");
														
 
															-            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
														
 
															+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
														
 
															             offload_func_kq(Kcur);
														
 
															             ggml_set_name(Kcur, "Kcur");
														
 
															-            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
														
 
															+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, freq_base, freq_scale, 0);
														
 
															             offload_func_kq(Qcur);
														
 
															             ggml_set_name(Qcur, "Qcur");
														
@@ -2032,9 +2056,18 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
 
															     }
														
 
															     // Normalize the second derivatives
														
 
															-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
														
 
															-    for (float & value : second_derivatives) {
														
 
															-        value /= second_derivatives_sum;
														
 
															+    {
														
 
															+        const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
														
 
															+
														
 
															+        if (second_derivatives_sum > 1e-6f) {
														
 
															+            for (float & value : second_derivatives) {
														
 
															+                value /= second_derivatives_sum;
														
 
															+            }
														
 
															+        } else {
														
 
															+            for (float & value : second_derivatives) {
														
 
															+                value = 1.0f / second_derivatives.size();
														
 
															+            }
														
 
															+        }
														
 
															     }
														
 
															     float cum_sum = 0.0f;
														
@@ -2213,7 +2246,7 @@ void llama_sample_classifier_free_guidance(
 
															           struct llama_context * guidance_ctx,
														
 
															                          float   scale,
														
 
															                          float   smooth_factor) {
														
 
															-    int64_t t_start_sample_us = t_start_sample_us = ggml_time_us();
														
 
															+    int64_t t_start_sample_us = ggml_time_us();
														
 
															     assert(ctx);
														
 
															     auto n_vocab = llama_n_vocab(ctx);
														
@@ -2701,8 +2734,9 @@ struct llama_model * llama_load_model_from_file(
 
															     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
														
 
															     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
														
 
															-                params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
														
 
															-                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
														
 
															+                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
														
 
															+                memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
														
 
															+                params.progress_callback_user_data)) {
														
 
															         delete model;
														
 
															         fprintf(stderr, "%s: failed to load model\n", __func__);
														
 
															         return nullptr;
														
@@ -2723,7 +2757,7 @@ struct llama_context * llama_new_context_with_model(
 
															         return nullptr;
														
 
															     }
														
 
															-    llama_context * ctx = new llama_context(*model, model->vocab);
														
 
															+    llama_context * ctx = new llama_context(*model);
														
 
															     if (params.seed == LLAMA_DEFAULT_SEED) {
														
 
															         params.seed = time(NULL);
														
@@ -2777,9 +2811,9 @@ struct llama_context * llama_new_context_with_model(
 
															             ctx->embedding.resize(hparams.n_embd);
														
 
															         }
														
 
															-        ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
														
 
															+        ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type));
														
 
															-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
														
 
															+        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
														
 
															         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
														
 
															     }
														
@@ -3561,13 +3595,13 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
 
															     return 0;
														
 
															 }
														
 
															-int llama_tokenize(
														
 
															-        struct llama_context * ctx,
														
 
															+int llama_tokenize_with_model(
														
 
															+    const struct llama_model * model,
														
 
															                   const char * text,
														
 
															                  llama_token * tokens,
														
 
															                          int   n_max_tokens,
														
 
															                         bool   add_bos) {
														
 
															-    auto res = llama_tokenize(ctx->vocab, text, add_bos);
														
 
															+    auto res = llama_tokenize(model->vocab, text, add_bos);
														
 
															     if (n_max_tokens < (int) res.size()) {
														
 
															         fprintf(stderr, "%s: too many tokens\n", __func__);
														
@@ -3581,8 +3615,29 @@ int llama_tokenize(
 
															     return res.size();
														
 
															 }
														
 
															+int llama_tokenize( // Context-based wrapper: tokenizes `text` with the context's model.
														
 
															+        struct llama_context * ctx,
														
 
															+                  const char * text,
														
 
															+                 llama_token * tokens,
														
 
															+                         int   n_max_tokens,
														
 
															+                        bool   add_bos) {
														
 
															+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); // every argument is forwarded unchanged
														
 
															+}
														
 
															+
														
 
															+int llama_n_vocab_from_model(const struct llama_model * model) { // Vocabulary size: number of entries in the model's id_to_token table.
														
 
															+    return model->vocab.id_to_token.size(); // the size() result is converted to the int return type
														
 
															+}
														
 
															+
														
 
															+int llama_n_ctx_from_model(const struct llama_model * model) { // Returns the n_ctx value stored in the model's hparams.
														
 
															+    return model->hparams.n_ctx;
														
 
															+}
														
 
															+
														
 
															+int llama_n_embd_from_model(const struct llama_model * model) { // Returns the n_embd value stored in the model's hparams.
														
 
															+    return model->hparams.n_embd;
														
 
															+}
														
 
															+
														
 
															 int llama_n_vocab(const struct llama_context * ctx) {
														
 
															-    return ctx->vocab.id_to_token.size();
														
 
															+    return ctx->model.vocab.id_to_token.size(); // the vocabulary is now reached through the context's model
														
 
															 }
														
 
															 int llama_n_ctx(const struct llama_context * ctx) {
														
@@ -3593,19 +3648,27 @@ int llama_n_embd(const struct llama_context * ctx) {
 
															     return ctx->model.hparams.n_embd;
														
 
															 }
														
 
															-int llama_get_vocab(
														
 
															-        const struct llama_context * ctx,
														
 
															+int llama_get_vocab_from_model( // Fills `strings` and `scores` with the first n vocabulary entries and returns n.
														
 
															+        const struct llama_model * model,
														
 
															         const char * * strings,
														
 
															         float  * scores,
														
 
															         int capacity) {
														
 
															-    int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
														
 
															+    int n = std::min(capacity, (int) model->vocab.id_to_token.size()); // n is bounded by both `capacity` and the vocabulary size
														
 
															     for (int i = 0; i<n; ++i) {
														
 
															-        strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
														
 
															-        scores[i]  = ctx->vocab.id_to_token[i].score;
														
 
															+        strings[i] = model->vocab.id_to_token[i].tok.c_str(); // stores a pointer into the model's token text, not a copy
														
 
															+        scores[i]  = model->vocab.id_to_token[i].score;
														
 
															     }
														
 
															     return n;
														
 
															 }
														
 
															+int llama_get_vocab( // Context-based wrapper around llama_get_vocab_from_model.
														
 
															+        const struct llama_context * ctx,
														
 
															+        const char * * strings,
														
 
															+        float  * scores,
														
 
															+        int capacity) {
														
 
															+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity); // uses the context's model; other arguments pass through
														
 
															+}
														
 
															+
														
 
															 float * llama_get_logits(struct llama_context * ctx) {
														
 
															     return ctx->logits.data();
														
 
															 }
														
@@ -3614,12 +3677,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
 
															     return ctx->embedding.data();
														
 
															 }
														
 
															-const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
														
 
															-    if (token >= llama_n_vocab(ctx)) {
														
 
															+const char * llama_token_to_str_with_model(const struct llama_model * model, llama_token token) { // Token id -> text from the model's vocabulary; nullptr for ids outside [0, n_vocab).
														
 
															+    if (token < 0 || token >= llama_n_vocab_from_model(model)) { // a negative id would index before the start of id_to_token
														
 
															         return nullptr;
														
 
															     }
														
 
															-    return ctx->vocab.id_to_token[token].tok.c_str();
														
 
															+    return model->vocab.id_to_token[token].tok.c_str(); // pointer into the model's token text, not a copy
														
 
															+}
														
 
															+
														
 
															+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { // Context-based wrapper around llama_token_to_str_with_model.
														
 
															+    return llama_token_to_str_with_model(&ctx->model, token); // looks the token up in the context's model
														
 
															 }
														
 
															 llama_token llama_token_bos() {
														
--- a/llama/llama.h
+++ b/llama/llama.h
@@ -1,5 +1,5 @@
 
															 /**
														
 
															- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
														
 
															+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
														
 
															  *
														
 
															  * MIT License
														
 
															  *
														
@@ -115,6 +115,11 @@ extern "C" {
 
															         int32_t  n_gpu_layers;                 // number of layers to store in VRAM
														
 
															         int32_t  main_gpu;                     // the GPU that is used for scratch and small tensors
														
 
															         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
														
 
															+
														
 
															+        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
														
 
															+        float    rope_freq_base;  // RoPE base frequency
														
 
															+        float    rope_freq_scale; // RoPE frequency scaling factor
														
 
															+
														
 
															         // called with a progress value between 0 and 1, pass NULL to disable
														
 
															         llama_progress_callback progress_callback;
														
 
															         // context pointer passed to the progress callback
														
@@ -174,6 +179,8 @@ extern "C" {
 
															         int32_t n_eval;
														
 
															     };
														
 
															+    LLAMA_API int llama_max_devices();
														
 
															+
														
 
															     LLAMA_API struct llama_context_params llama_context_default_params();
														
 
															     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
														
@@ -296,10 +303,21 @@ extern "C" {
 
															                              int   n_max_tokens,
														
 
															                             bool   add_bos);
														
 
															+    LLAMA_API int llama_tokenize_with_model( // Same as llama_tokenize, but takes the model directly instead of a context
														
 
															+        const struct llama_model * model,
														
 
															+                      const char * text,
														
 
															+                     llama_token * tokens,
														
 
															+                             int   n_max_tokens,
														
 
															+                            bool   add_bos);
														
 
															+
														
 
															     LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
														
 
															     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
														
 
															     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
														
 
															+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); // number of tokens in the model's vocabulary
														
 
															+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model); // n_ctx stored in the model's hparams
														
 
															+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); // n_embd stored in the model's hparams
														
 
															+
														
 
															     // Get the vocabulary as output parameters.
														
 
															     // Returns number of results.
														
 
															     LLAMA_API int llama_get_vocab(
														
@@ -308,6 +326,12 @@ extern "C" {
 
															                                  float * scores,
														
 
															                                    int   capacity);
														
 
															+    LLAMA_API int llama_get_vocab_from_model( // Same as llama_get_vocab, but takes the model directly instead of a context
														
 
															+              const struct llama_model * model,
														
 
															+                          const char * * strings,
														
 
															+                                 float * scores,
														
 
															+                                   int   capacity);
														
 
															+
														
 
															     // Token logits obtained from the last call to llama_eval()
														
 
															     // The logits for the last token are stored in the last row
														
 
															     // Can be mutated in order to change the probabilities of the next token
														
@@ -320,7 +344,13 @@ extern "C" {
 
															     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
														
 
															     // Token Id -> String. Uses the vocabulary in the provided context
														
 
															-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
														
 
															+    LLAMA_API const char * llama_token_to_str(
														
 
															+            const struct llama_context * ctx,
														
 
															+                           llama_token   token);
														
 
															+
														
 
															+    LLAMA_API const char * llama_token_to_str_with_model( // Token Id -> String using the model's vocabulary; nullptr when token >= vocabulary size
														
 
															+              const struct llama_model * model,
														
 
															+                           llama_token   token);
														
 
															     // Special tokens
														
 
															     LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
														
--- a/llama/update-llama-cpp.sh
+++ b/llama/update-llama-cpp.sh
@@ -0,0 +1,61 @@
 
															+#!/bin/sh
														
 
															+
														
 
															+set -eu
														
 
															+
														
 
															+
														
 
															+status() { echo >&2 ">>> $*"; }
														
 
															+error() { status "ERROR $*"; }
														
 
															+usage() {
														
 
															+    echo "usage: $(basename $0) /path/to/repo"
														
 
															+    exit 1
														
 
															+}
														
 
															+
														
 
															+OUT=$(dirname $0)
														
 
															+while getopts "hC:" OPTION; do
														
 
															+    case $OPTION in
														
 
															+        C) OUT=$OPTARG ;;
														
 
															+        *) usage ;;
														
 
															+    esac
														
 
															+done
														
 
															+
														
 
															+shift $(( $OPTIND - 1 ))
														
 
															+[ $# -eq 1 ] || usage
														
 
															+
														
 
															+status "updating source..."
														
 
															+cp -a "$1"/*.{c,h,cpp,m,metal,cu} "$OUT"
														
 
															+
														
 
															+status "removing incompatible files..."
														
 
															+rm -f "$OUT"/build-info.h
														
 
															+rm -f "$OUT"/ggml-{mpi,opencl}.*
														
 
															+
														
 
															+SHA1=$(git -C $1 rev-parse @)
														
 
															+
														
 
															+LICENSE=$(mktemp)
														
 
															+cleanup() {
														
 
															+    rm -f $LICENSE
														
 
															+}
														
 
															+trap cleanup 0
														
 
															+
														
 
															+cat <<EOF | sed 's/ *$//' >$LICENSE
														
 
															+/**
														
 
															+ * llama.cpp - git $SHA1
														
 
															+ *
														
 
															+$(sed 's/^/ * /' <$1/LICENSE)
														
 
															+ */
														
 
															+
														
 
															+EOF
														
 
															+
														
 
															+for f in $OUT/*.{c,h,cpp,m,metal,cu}; do
														
 
															+    TMP=$(mktemp)
														
 
															+    status "updating license: $f"
														
 
															+    cat $LICENSE $f >$TMP
														
 
															+    mv $TMP $f
														
 
															+done
														
 
															+
														
 
															+status "touching up MacOS files..."
														
 
															+TMP=$(mktemp)
														
 
															+{
														
 
															+    echo "// +build darwin"
														
 
															+    echo
														
 
															+} | cat - $OUT/ggml-metal.m >$TMP
														
 
															+mv $TMP $OUT/ggml-metal.m