|
@@ -1,5 +1,5 @@
|
|
/**
|
|
/**
|
|
- * llama.cpp - git 5bf2a2771886ee86137e01dbc7492f78fb392066
|
|
|
|
|
|
+ * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
|
|
*
|
|
*
|
|
* MIT License
|
|
* MIT License
|
|
*
|
|
*
|
|
@@ -357,26 +357,33 @@ kernel void kernel_rms_norm(
|
|
threadgroup float * sum [[threadgroup(0)]],
|
|
threadgroup float * sum [[threadgroup(0)]],
|
|
uint tgpig[[threadgroup_position_in_grid]],
|
|
uint tgpig[[threadgroup_position_in_grid]],
|
|
uint tpitg[[thread_position_in_threadgroup]],
|
|
uint tpitg[[thread_position_in_threadgroup]],
|
|
|
|
+ uint sgitg[[simdgroup_index_in_threadgroup]],
|
|
|
|
+ uint tiisg[[thread_index_in_simdgroup]],
|
|
uint ntg[[threads_per_threadgroup]]) {
|
|
uint ntg[[threads_per_threadgroup]]) {
|
|
- device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
|
|
|
|
|
|
+ device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
|
|
|
|
+ device const float * x_scalar = (device const float *) x;
|
|
|
|
+ float4 sumf=0;
|
|
|
|
+ float all_sum=0;
|
|
|
|
|
|
// parallel sum
|
|
// parallel sum
|
|
- sum[tpitg] = 0.0f;
|
|
|
|
- for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
|
|
- sum[tpitg] += x[i00] * x[i00];
|
|
|
|
|
|
+ for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
|
|
+ sumf += x[i00] * x[i00];
|
|
|
|
+ }
|
|
|
|
+ all_sum = sumf[0] + sumf[1] + sumf[2] + sumf[3];
|
|
|
|
+ all_sum = simd_sum(all_sum);
|
|
|
|
+ if (tiisg == 0) {
|
|
|
|
+ sum[sgitg] = all_sum;
|
|
}
|
|
}
|
|
|
|
|
|
- // reduce
|
|
|
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
- for (uint i = ntg/2; i > 0; i /= 2) {
|
|
|
|
- if (tpitg < i) {
|
|
|
|
- sum[tpitg] += sum[tpitg + i];
|
|
|
|
- }
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
|
|
+ // broadcast, simd group number is ntg / 32
|
|
|
|
+ for (int i = ntg / 32 / 2; i > 0; i /= 2) {
|
|
|
|
+ if (tpitg < i) {
|
|
|
|
+ sum[tpitg] += sum[tpitg + i];
|
|
|
|
+ }
|
|
}
|
|
}
|
|
-
|
|
|
|
- // broadcast
|
|
|
|
if (tpitg == 0) {
|
|
if (tpitg == 0) {
|
|
|
|
+ for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
|
|
sum[0] /= ne00;
|
|
sum[0] /= ne00;
|
|
}
|
|
}
|
|
|
|
|
|
@@ -385,147 +392,127 @@ kernel void kernel_rms_norm(
|
|
const float mean = sum[0];
|
|
const float mean = sum[0];
|
|
const float scale = 1.0f/sqrt(mean + eps);
|
|
const float scale = 1.0f/sqrt(mean + eps);
|
|
|
|
|
|
- device float * y = dst + tgpig*ne00;
|
|
|
|
- for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
|
|
|
|
|
|
+ device float4 * y = (device float4 *) (dst + tgpig*ne00);
|
|
|
|
+ device float * y_scalar = (device float *) y;
|
|
|
|
+ for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
|
y[i00] = x[i00] * scale;
|
|
y[i00] = x[i00] * scale;
|
|
}
|
|
}
|
|
|
|
+ if (tpitg == 0) {
|
|
|
|
+ for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
-kernel void kernel_mul_mat_q4_0_f32(
|
|
|
|
- device const void * src0,
|
|
|
|
- device const float * src1,
|
|
|
|
- device float * dst,
|
|
|
|
- constant int64_t & ne00,
|
|
|
|
- constant int64_t & ne10,
|
|
|
|
- constant int64_t & ne0,
|
|
|
|
- threadgroup float * sum [[threadgroup(0)]],
|
|
|
|
- uint2 tgpig[[threadgroup_position_in_grid]],
|
|
|
|
- uint2 tpitg[[thread_position_in_threadgroup]],
|
|
|
|
- uint2 tptg[[threads_per_threadgroup]]) {
|
|
|
|
- const int nb = ne00/QK4_0;
|
|
|
|
|
|
+// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
|
|
|
|
+float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
|
|
|
|
+ float d = qb_curr->d;
|
|
|
|
+ float4 acc = 0.f;
|
|
|
|
+ device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
|
|
|
|
+ for (int i = 0; i < 16; i+=2) {
|
|
|
|
+ acc[0] += yl[i] * (qs[i / 2] & 0x000F);
|
|
|
|
+ acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
|
|
|
|
+ acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
|
|
|
|
+ acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
|
|
|
|
+ }
|
|
|
|
+ return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
|
|
|
|
+}
|
|
|
|
|
|
- const int64_t r0 = tgpig.x;
|
|
|
|
- const int64_t r1 = tgpig.y;
|
|
|
|
|
|
+// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
|
|
|
|
+float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
|
|
|
|
+ float d = qb_curr->d;
|
|
|
|
+ float m = qb_curr->m;
|
|
|
|
+ float4 acc = 0.f;
|
|
|
|
+ device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
|
|
|
|
+ for (int i = 0; i < 16; i+=2) {
|
|
|
|
+ acc[0] += yl[i] * (qs[i / 2] & 0x000F);
|
|
|
|
+ acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
|
|
|
|
+ acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
|
|
|
|
+ acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
|
|
|
|
+ }
|
|
|
|
+ return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
|
|
|
|
+}
|
|
|
|
|
|
- device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
|
|
|
|
|
|
+// putting them in the kernel cause a significant performance penalty
|
|
|
|
+#define N_DST 4 // each SIMD group works on 4 rows
|
|
|
|
+#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
|
|
|
+#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
|
|
|
+template<typename block_q_type>
|
|
|
|
+void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
|
|
|
|
+ int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
|
|
|
|
+ uint2 tgpig, uint tiisg, uint sgitg) {
|
|
|
|
+ const int nb = ne00/QK4_0;
|
|
|
|
+ const int r0 = tgpig.x;
|
|
|
|
+ const int r1 = tgpig.y;
|
|
|
|
+ device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
|
|
device const float * y = (device const float *) src1 + r1*ne10;
|
|
device const float * y = (device const float *) src1 + r1*ne10;
|
|
-
|
|
|
|
- const int nth = tptg.x*tptg.y;
|
|
|
|
- const int ith = tptg.y*tpitg.x + tpitg.y;
|
|
|
|
-
|
|
|
|
- const int ix = tpitg.y/4; // 0 or 1
|
|
|
|
- const int iy = tpitg.y - 4*ix; // 0...3
|
|
|
|
-
|
|
|
|
- const int first = 4 * iy;
|
|
|
|
-
|
|
|
|
- float sumf = 0;
|
|
|
|
-
|
|
|
|
- for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
|
|
|
|
-
|
|
|
|
- const float d = (float)x[i].d;
|
|
|
|
-
|
|
|
|
- device const uint8_t * xl = x[i].qs + first;
|
|
|
|
- device const float * yl = y + i * QK4_0 + first;
|
|
|
|
-
|
|
|
|
- float2 acc = {0.0f, 0.0f};
|
|
|
|
-
|
|
|
|
- for (int j = 0; j < 4; ++j) {
|
|
|
|
-
|
|
|
|
- acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
|
|
|
|
- acc[1] += yl[j] + yl[j+16];
|
|
|
|
-
|
|
|
|
|
|
+ float4 y_curr[8]; // src1 vector cache
|
|
|
|
+ float sumf[N_DST]={0.f}, all_sum;
|
|
|
|
+ thread float * yl=(thread float *)y_curr;
|
|
|
|
+
|
|
|
|
+ // each thread in a SIMD group deals with 1 block.
|
|
|
|
+ for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
|
|
|
|
+ float sumy = 0;
|
|
|
|
+ for (int i = 0; i < QK4_0 / 4; i++) {
|
|
|
|
+ y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
|
|
|
|
+ sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
|
}
|
|
}
|
|
|
|
|
|
- sumf += d * (acc[0] - 8.f*acc[1]);
|
|
|
|
|
|
+ for (int row = 0; row < N_DST; row++) {
|
|
|
|
+ sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
- sum[ith] = sumf;
|
|
|
|
|
|
+ // from now loads two rows every time and 16 blocks per row
|
|
|
|
+ int ir = tiisg / (N_SIMDWIDTH / 2);
|
|
|
|
+ int ib = tiisg % (N_SIMDWIDTH / 2);
|
|
|
|
+ for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
|
|
|
|
+ int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
|
|
|
|
+ float sumy = 0;
|
|
|
|
+ for (int i = 0; i < QK4_0 / 4; i++) {
|
|
|
|
+ y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
|
|
|
|
+ sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
|
|
|
+ }
|
|
|
|
|
|
- //
|
|
|
|
- // Accumulate the sum from all threads in the threadgroup
|
|
|
|
- //
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith%4 == 0) {
|
|
|
|
- sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
|
|
|
- }
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith%16 == 0) {
|
|
|
|
- sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
|
|
|
|
|
+ for (int row = 0; row < N_DST; row+=2) {
|
|
|
|
+ if (nb_start + ib < nb) {
|
|
|
|
+ sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
}
|
|
}
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith == 0) {
|
|
|
|
- for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
|
|
|
- dst[r1*ne0 + r0] = sum[0];
|
|
|
|
|
|
+
|
|
|
|
+ for (int row = 0; row < N_DST; ++row) {
|
|
|
|
+ all_sum = simd_sum(sumf[row]);
|
|
|
|
+ if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
|
|
|
+ dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
|
|
|
+ }
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
-kernel void kernel_mul_mat_q4_1_f32(
|
|
|
|
|
|
+kernel void kernel_mul_mat_q4_0_f32(
|
|
device const void * src0,
|
|
device const void * src0,
|
|
device const float * src1,
|
|
device const float * src1,
|
|
device float * dst,
|
|
device float * dst,
|
|
constant int64_t & ne00,
|
|
constant int64_t & ne00,
|
|
constant int64_t & ne10,
|
|
constant int64_t & ne10,
|
|
constant int64_t & ne0,
|
|
constant int64_t & ne0,
|
|
- threadgroup float * sum [[threadgroup(0)]],
|
|
|
|
|
|
+ constant int64_t & ne01[[buffer(4)]],
|
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
|
- uint2 tpitg[[thread_position_in_threadgroup]],
|
|
|
|
- uint2 tptg[[threads_per_threadgroup]]) {
|
|
|
|
- const int nb = ne00/QK4_1;
|
|
|
|
-
|
|
|
|
- const int64_t r0 = tgpig.x;
|
|
|
|
- const int64_t r1 = tgpig.y;
|
|
|
|
-
|
|
|
|
- device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
|
|
|
|
- device const float * y = (device const float *) src1 + r1*ne10;
|
|
|
|
-
|
|
|
|
- const uint nth = tptg.x*tptg.y;
|
|
|
|
- const uint ith = tptg.y*tpitg.x + tpitg.y;
|
|
|
|
-
|
|
|
|
- const int ix = tpitg.y/4; // 0 or 1
|
|
|
|
- const int iy = tpitg.y - 4*ix; // 0...3
|
|
|
|
-
|
|
|
|
- const int first = 4 * iy;
|
|
|
|
-
|
|
|
|
- float sumf = 0;
|
|
|
|
-
|
|
|
|
- for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
|
|
|
|
-
|
|
|
|
- const float d = (float)x[i].d;
|
|
|
|
- const float m = (float)x[i].m;
|
|
|
|
-
|
|
|
|
- device const uint8_t * xl = x[i].qs + first;
|
|
|
|
- device const float * yl = y + i * QK4_1 + first;
|
|
|
|
-
|
|
|
|
- float2 acc = {0.0f, 0.0f};
|
|
|
|
-
|
|
|
|
- for (int j = 0; j < 4; ++j) {
|
|
|
|
-
|
|
|
|
- acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
|
|
|
|
- acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m);
|
|
|
|
-
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- sumf += acc[0] + acc[1];
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- sum[ith] = sumf;
|
|
|
|
|
|
+ uint tiisg[[thread_index_in_simdgroup]],
|
|
|
|
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
|
|
+ mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
|
|
|
|
+}
|
|
|
|
|
|
- //
|
|
|
|
- // Accumulate the sum from all threads in the threadgroup
|
|
|
|
- //
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith%4 == 0) {
|
|
|
|
- sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
|
|
|
- }
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith%16 == 0) {
|
|
|
|
- sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
|
|
|
- }
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith == 0) {
|
|
|
|
- for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
|
|
|
|
- dst[r1*ne0 + r0] = sum[0];
|
|
|
|
- }
|
|
|
|
|
|
+kernel void kernel_mul_mat_q4_1_f32(
|
|
|
|
+ device const void * src0,
|
|
|
|
+ device const float * src1,
|
|
|
|
+ device float * dst,
|
|
|
|
+ constant int64_t & ne00,
|
|
|
|
+ constant int64_t & ne10,
|
|
|
|
+ constant int64_t & ne0,
|
|
|
|
+ constant int64_t & ne01[[buffer(4)]],
|
|
|
|
+ uint2 tgpig[[threadgroup_position_in_grid]],
|
|
|
|
+ uint tiisg[[thread_index_in_simdgroup]],
|
|
|
|
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
|
|
+ mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
|
|
}
|
|
}
|
|
|
|
|
|
kernel void kernel_mul_mat_f16_f32(
|
|
kernel void kernel_mul_mat_f16_f32(
|
|
@@ -641,17 +628,19 @@ kernel void kernel_rope(
|
|
constant int & n_past,
|
|
constant int & n_past,
|
|
constant int & n_dims,
|
|
constant int & n_dims,
|
|
constant int & mode,
|
|
constant int & mode,
|
|
|
|
+ constant float & freq_base,
|
|
|
|
+ constant float & freq_scale,
|
|
uint3 tpig[[thread_position_in_grid]]) {
|
|
uint3 tpig[[thread_position_in_grid]]) {
|
|
const int64_t i3 = tpig[2];
|
|
const int64_t i3 = tpig[2];
|
|
const int64_t i2 = tpig[1];
|
|
const int64_t i2 = tpig[1];
|
|
const int64_t i1 = tpig[0];
|
|
const int64_t i1 = tpig[0];
|
|
|
|
|
|
const bool is_neox = mode & 2;
|
|
const bool is_neox = mode & 2;
|
|
- const float theta_scale = pow(10000.0, -2.0f/n_dims);
|
|
|
|
|
|
+ const float theta_scale = pow(freq_base, -2.0f/n_dims);
|
|
|
|
|
|
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
|
const int64_t p = ((mode & 1) == 0 ? n_past + i2 : i2);
|
|
|
|
|
|
- float theta = (float)p;
|
|
|
|
|
|
+ float theta = freq_scale * (float)p;
|
|
|
|
|
|
if (!is_neox) {
|
|
if (!is_neox) {
|
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
@@ -1489,6 +1478,7 @@ kernel void kernel_mul_mat_q3_K_f32(
|
|
|
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+#if QK_K == 256
|
|
kernel void kernel_mul_mat_q4_K_f32(
|
|
kernel void kernel_mul_mat_q4_K_f32(
|
|
device const void * src0,
|
|
device const void * src0,
|
|
device const float * src1,
|
|
device const float * src1,
|
|
@@ -1496,131 +1486,180 @@ kernel void kernel_mul_mat_q4_K_f32(
|
|
constant int64_t & ne00,
|
|
constant int64_t & ne00,
|
|
constant int64_t & ne10,
|
|
constant int64_t & ne10,
|
|
constant int64_t & ne0,
|
|
constant int64_t & ne0,
|
|
- threadgroup float * sum [[threadgroup(0)]],
|
|
|
|
|
|
+ constant int64_t & ne01[[buffer(4)]],
|
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
|
- uint2 tpitg[[thread_position_in_threadgroup]],
|
|
|
|
- uint2 tptg[[threads_per_threadgroup]]) {
|
|
|
|
-
|
|
|
|
- const int nb = ne00/QK_K;
|
|
|
|
-
|
|
|
|
- const int64_t r0 = tgpig.x;
|
|
|
|
- const int64_t r1 = tgpig.y;
|
|
|
|
-
|
|
|
|
- const int nth = tptg.x*tptg.y;
|
|
|
|
- const int ith = tptg.y*tpitg.x + tpitg.y;
|
|
|
|
-
|
|
|
|
- device const block_q4_K * x = (device const block_q4_K *) src0 + r0*nb;
|
|
|
|
- device const float * yy = (device const float *) src1 + r1*ne10;
|
|
|
|
-
|
|
|
|
- float sumf = 0;
|
|
|
|
-
|
|
|
|
-#if QK_K == 256
|
|
|
|
|
|
+ uint tiisg[[thread_index_in_simdgroup]],
|
|
|
|
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
|
|
|
|
const uint16_t kmask1 = 0x3f3f;
|
|
const uint16_t kmask1 = 0x3f3f;
|
|
const uint16_t kmask2 = 0x0f0f;
|
|
const uint16_t kmask2 = 0x0f0f;
|
|
const uint16_t kmask3 = 0xc0c0;
|
|
const uint16_t kmask3 = 0xc0c0;
|
|
|
|
|
|
- const int tid = tpitg.y; // 0...16
|
|
|
|
- const int il = tid/4; // 0...3
|
|
|
|
- const int ir = tid - 4*il;// 0...3
|
|
|
|
- const int n = 4;
|
|
|
|
|
|
+ const int ix = tiisg/8; // 0...3
|
|
|
|
+ const int it = tiisg%8; // 0...7
|
|
|
|
+ const int im = it/4; // 0 or 1
|
|
|
|
+ const int ir = it%4; // 0...3
|
|
|
|
|
|
- const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
|
|
|
- const int in = il%2;
|
|
|
|
|
|
+ const int nb = ne00/QK_K;
|
|
|
|
+ const int r0 = tgpig.x;
|
|
|
|
+ const int r1 = tgpig.y;
|
|
|
|
+ const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
|
|
|
+ const int ib_row = first_row * nb;
|
|
|
|
+ device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
|
|
|
|
+ device const float * y = (device const float *) src1 + r1*ne10;
|
|
|
|
+ float yl[16];
|
|
|
|
+ float yh[16];
|
|
|
|
+ float sumf[N_DST]={0.f}, all_sum;
|
|
|
|
|
|
- const int l0 = n*(2*ir + in);
|
|
|
|
- const int q_offset = 32*im + l0;
|
|
|
|
- const int y_offset = 64*im + l0;
|
|
|
|
|
|
+ const int step = sizeof(block_q4_K) * nb / 2;
|
|
|
|
|
|
- uchar2 sc1, sc2, sc3, sc4;
|
|
|
|
|
|
+ device const float * y4 = y + ix * QK_K + 64 * im + 8 * ir;
|
|
|
|
|
|
- for (int i = tpitg.x; i < nb; i += tptg.x) {
|
|
|
|
|
|
+ uint16_t sc16[4];
|
|
|
|
+ thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
|
|
|
|
|
|
- device const uint8_t * q1 = (x + i)->qs + q_offset;
|
|
|
|
- device const uint8_t * q2 = q1 + 64;
|
|
|
|
- device const float * y1 = yy + i*QK_K + y_offset;
|
|
|
|
- device const float * y2 = y1 + 128;
|
|
|
|
|
|
+ for (int ib = ix; ib < nb; ib += 4) {
|
|
|
|
|
|
- const float dall = (float)((x + i)->d);
|
|
|
|
- const float dmin = (float)((x + i)->dmin);
|
|
|
|
|
|
+ float4 sumy = {0.f, 0.f, 0.f, 0.f};
|
|
|
|
+ for (int i = 0; i < 8; ++i) {
|
|
|
|
+ yl[i+0] = y4[i+ 0]; sumy[0] += yl[i+0];
|
|
|
|
+ yl[i+8] = y4[i+ 32]; sumy[1] += yl[i+8];
|
|
|
|
+ yh[i+0] = y4[i+128]; sumy[2] += yh[i+0];
|
|
|
|
+ yh[i+8] = y4[i+160]; sumy[3] += yh[i+8];
|
|
|
|
+ }
|
|
|
|
|
|
- device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
|
|
|
|
- sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
|
|
|
|
- sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
|
|
|
|
- sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
|
|
|
|
- sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
|
|
|
|
|
|
+ device const uint16_t * sc = (device const uint16_t *)x[ib].scales + im;
|
|
|
|
+ device const uint16_t * q1 = (device const uint16_t *)x[ib].qs + 16 * im + 4 * ir;
|
|
|
|
+ device const half * dh = &x[ib].d;
|
|
|
|
+
|
|
|
|
+ for (int row = 0; row < N_DST; row++) {
|
|
|
|
+
|
|
|
|
+ sc16[0] = sc[0] & kmask1;
|
|
|
|
+ sc16[1] = sc[2] & kmask1;
|
|
|
|
+ sc16[2] = ((sc[4] >> 0) & kmask2) | ((sc[0] & kmask3) >> 2);
|
|
|
|
+ sc16[3] = ((sc[4] >> 4) & kmask2) | ((sc[2] & kmask3) >> 2);
|
|
|
|
+
|
|
|
|
+ device const uint16_t * q2 = q1 + 32;
|
|
|
|
+
|
|
|
|
+ float4 acc1 = {0.f, 0.f, 0.f, 0.f};
|
|
|
|
+ float4 acc2 = {0.f, 0.f, 0.f, 0.f};
|
|
|
|
+ for (int i = 0; i < 8; i += 2) {
|
|
|
|
+ acc1[0] += yl[i+0] * (q1[i/2] & 0x000F);
|
|
|
|
+ acc1[1] += yl[i+1] * (q1[i/2] & 0x0F00);
|
|
|
|
+ acc1[2] += yl[i+8] * (q1[i/2] & 0x00F0);
|
|
|
|
+ acc1[3] += yl[i+9] * (q1[i/2] & 0xF000);
|
|
|
|
+ acc2[0] += yh[i+0] * (q2[i/2] & 0x000F);
|
|
|
|
+ acc2[1] += yh[i+1] * (q2[i/2] & 0x0F00);
|
|
|
|
+ acc2[2] += yh[i+8] * (q2[i/2] & 0x00F0);
|
|
|
|
+ acc2[3] += yh[i+9] * (q2[i/2] & 0xF000);
|
|
|
|
+ }
|
|
|
|
|
|
- float4 s = {0.f, 0.f, 0.f, 0.f};
|
|
|
|
- float smin = 0;
|
|
|
|
- for (int l = 0; l < n; ++l) {
|
|
|
|
|
|
+ float dall = dh[0];
|
|
|
|
+ float dmin = dh[1];
|
|
|
|
+ sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8[0] +
|
|
|
|
+ (acc1[2] + 1.f/256.f * acc1[3]) * sc8[1] * 1.f/16.f +
|
|
|
|
+ (acc2[0] + 1.f/256.f * acc2[1]) * sc8[4] +
|
|
|
|
+ (acc2[2] + 1.f/256.f * acc2[3]) * sc8[5] * 1.f/16.f) -
|
|
|
|
+ dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
|
|
|
|
+
|
|
|
|
+ q1 += step;
|
|
|
|
+ sc += step;
|
|
|
|
+ dh += step;
|
|
|
|
+ }
|
|
|
|
|
|
- s[0] += y1[l] * (q1[l] & 0xF); s[1] += y1[l+32] * (q1[l] >> 4);
|
|
|
|
- s[2] += y2[l] * (q2[l] & 0xF); s[3] += y2[l+32] * (q2[l] >> 4);
|
|
|
|
- smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
|
|
|
|
|
|
+ y4 += 4 * QK_K;
|
|
|
|
+ }
|
|
|
|
|
|
|
|
+ for (int row = 0; row < N_DST; ++row) {
|
|
|
|
+ all_sum = simd_sum(sumf[row]);
|
|
|
|
+ if (tiisg == 0) {
|
|
|
|
+ dst[r1*ne0 + first_row + row] = all_sum;
|
|
}
|
|
}
|
|
- sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
|
|
|
|
-
|
|
|
|
}
|
|
}
|
|
|
|
+}
|
|
#else
|
|
#else
|
|
- uint16_t aux16[2];
|
|
|
|
- thread const uint8_t * scales = (thread const uint8_t *)aux16;
|
|
|
|
|
|
+kernel void kernel_mul_mat_q4_K_f32(
|
|
|
|
+ device const void * src0,
|
|
|
|
+ device const float * src1,
|
|
|
|
+ device float * dst,
|
|
|
|
+ constant int64_t & ne00,
|
|
|
|
+ constant int64_t & ne10,
|
|
|
|
+ constant int64_t & ne0,
|
|
|
|
+ constant int64_t & ne01[[buffer(4)]],
|
|
|
|
+ uint2 tgpig[[threadgroup_position_in_grid]],
|
|
|
|
+ uint tiisg[[thread_index_in_simdgroup]],
|
|
|
|
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
|
|
|
|
- const int il = 4*tpitg.x;
|
|
|
|
|
|
+ const int ix = tiisg/4; // 0...7
|
|
|
|
+ const int it = tiisg%4; // 0...3
|
|
|
|
|
|
- for (int i = tpitg.y; i < nb; i += tptg.y) {
|
|
|
|
|
|
+ const int nb = ne00/QK_K;
|
|
|
|
+ const int r0 = tgpig.x;
|
|
|
|
+ const int r1 = tgpig.y;
|
|
|
|
+ const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST;
|
|
|
|
+ const int ib_row = first_row * nb;
|
|
|
|
+ device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row;
|
|
|
|
+ device const float * y = (device const float *) src1 + r1*ne10;
|
|
|
|
+ float yl[8];
|
|
|
|
+ float yh[8];
|
|
|
|
+ float sumf[N_DST]={0.f}, all_sum;
|
|
|
|
|
|
- device const uint8_t * q = x[i].qs + il;
|
|
|
|
- device const float * y = yy + i * QK_K + il;
|
|
|
|
|
|
+ const int step = sizeof(block_q4_K) * nb / 2;
|
|
|
|
|
|
- const float d = (float)x[i].d[0];
|
|
|
|
- const float m = (float)x[i].d[1];
|
|
|
|
|
|
+ device const float * y4 = y + ix * QK_K + 8 * it;
|
|
|
|
|
|
- device const uint16_t * a = (device const uint16_t *)x[i].scales;
|
|
|
|
- aux16[0] = a[0] & 0x0f0f;
|
|
|
|
- aux16[1] = (a[0] >> 4) & 0x0f0f;
|
|
|
|
|
|
+ uint16_t sc16[4];
|
|
|
|
|
|
- for (int l = 0; l < 4; ++l) {
|
|
|
|
- sumf += d * scales[0] * (y[l+ 0] * (q[l] & 0xF) + y[l+16] * (q[l+16] & 0xF)) - m * scales[2] * (y[l+ 0] + y[l+16])
|
|
|
|
- + d * scales[1] * (y[l+32] * (q[l] >> 4) + y[l+48] * (q[l+16] >> 4)) - m * scales[3] * (y[l+32] + y[l+48]);
|
|
|
|
|
|
+ for (int ib = ix; ib < nb; ib += 8) {
|
|
|
|
+
|
|
|
|
+ float2 sumy = {0.f, 0.f};
|
|
|
|
+ for (int i = 0; i < 8; ++i) {
|
|
|
|
+ yl[i] = y4[i+ 0]; sumy[0] += yl[i];
|
|
|
|
+ yh[i] = y4[i+32]; sumy[1] += yh[i];
|
|
}
|
|
}
|
|
- }
|
|
|
|
-#endif
|
|
|
|
|
|
|
|
- sum[ith] = sumf;
|
|
|
|
|
|
+ device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
|
|
|
|
+ device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
|
|
|
|
+ device const half * dh = x[ib].d;
|
|
|
|
|
|
- //
|
|
|
|
- // Accumulate the sum from all threads in the threadgroup
|
|
|
|
- // This version is slightly faster than the commented out one below,
|
|
|
|
- // which I copy-pasted from ggerganov's q4_0 dot product for metal.
|
|
|
|
- //
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith%4 == 0) {
|
|
|
|
- for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
|
|
|
|
- }
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith%16 == 0) {
|
|
|
|
- for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
|
|
|
|
- }
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith == 0) {
|
|
|
|
- for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
|
|
|
- dst[r1*ne0 + r0] = sum[0];
|
|
|
|
- }
|
|
|
|
|
|
+ for (int row = 0; row < N_DST; row++) {
|
|
|
|
+
|
|
|
|
+ sc16[0] = sc[0] & 0x000f;
|
|
|
|
+ sc16[1] = sc[0] & 0x0f00;
|
|
|
|
+ sc16[2] = sc[0] & 0x00f0;
|
|
|
|
+ sc16[3] = sc[0] & 0xf000;
|
|
|
|
+
|
|
|
|
+ float2 acc1 = {0.f, 0.f};
|
|
|
|
+ float2 acc2 = {0.f, 0.f};
|
|
|
|
+ for (int i = 0; i < 8; i += 2) {
|
|
|
|
+ acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
|
|
|
|
+ acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
|
|
|
|
+ acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
|
|
|
|
+ acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ float dall = dh[0];
|
|
|
|
+ float dmin = dh[1];
|
|
|
|
+ sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
|
|
|
|
+ (acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
|
|
|
|
+ dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
|
|
|
|
+
|
|
|
|
+ qs += step;
|
|
|
|
+ sc += step;
|
|
|
|
+ dh += step;
|
|
|
|
+ }
|
|
|
|
|
|
- //// accumulate the sum from all threads in the threadgroup
|
|
|
|
- //threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- //for (uint i = nth/2; i > 0; i /= 2) {
|
|
|
|
- // if (ith < i) {
|
|
|
|
- // sum[ith] += sum[ith + i];
|
|
|
|
- // }
|
|
|
|
- // threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- //}
|
|
|
|
|
|
+ y4 += 8 * QK_K;
|
|
|
|
+ }
|
|
|
|
|
|
- //if (ith == 0) {
|
|
|
|
- // dst[r1*ne0 + r0] = sum[0];
|
|
|
|
- //}
|
|
|
|
|
|
+ for (int row = 0; row < N_DST; ++row) {
|
|
|
|
+ all_sum = simd_sum(sumf[row]);
|
|
|
|
+ if (tiisg == 0) {
|
|
|
|
+ dst[r1*ne0 + first_row + row] = all_sum;
|
|
|
|
+ }
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
+#endif
|
|
|
|
|
|
kernel void kernel_mul_mat_q5_K_f32(
|
|
kernel void kernel_mul_mat_q5_K_f32(
|
|
device const void * src0,
|
|
device const void * src0,
|
|
@@ -1629,39 +1668,39 @@ kernel void kernel_mul_mat_q5_K_f32(
|
|
constant int64_t & ne00,
|
|
constant int64_t & ne00,
|
|
constant int64_t & ne10,
|
|
constant int64_t & ne10,
|
|
constant int64_t & ne0,
|
|
constant int64_t & ne0,
|
|
- threadgroup float * sum [[threadgroup(0)]],
|
|
|
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
|
- uint2 tpitg[[thread_position_in_threadgroup]],
|
|
|
|
- uint2 tptg[[threads_per_threadgroup]]) {
|
|
|
|
|
|
+ uint tiisg[[thread_index_in_simdgroup]],
|
|
|
|
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
|
|
|
|
const int nb = ne00/QK_K;
|
|
const int nb = ne00/QK_K;
|
|
|
|
|
|
const int64_t r0 = tgpig.x;
|
|
const int64_t r0 = tgpig.x;
|
|
const int64_t r1 = tgpig.y;
|
|
const int64_t r1 = tgpig.y;
|
|
|
|
|
|
- device const block_q5_K * x = (device const block_q5_K *) src0 + r0*nb;
|
|
|
|
|
|
+ const int first_row = (r0 * N_SIMDGROUP + sgitg) * 2;
|
|
|
|
+
|
|
|
|
+ device const block_q5_K * x = (device const block_q5_K *) src0 + first_row*nb;
|
|
device const float * yy = (device const float *) src1 + r1*ne10;
|
|
device const float * yy = (device const float *) src1 + r1*ne10;
|
|
|
|
|
|
- const int nth = tptg.x*tptg.y;
|
|
|
|
- const int ith = tptg.y*tpitg.x + tpitg.y;
|
|
|
|
|
|
+ float sumf[2]={0.f};
|
|
|
|
|
|
- float sumf = 0;
|
|
|
|
|
|
+ const int step = sizeof(block_q5_K) * nb;
|
|
|
|
|
|
#if QK_K == 256
|
|
#if QK_K == 256
|
|
|
|
+#
|
|
|
|
+ float yl[16], yh[16];
|
|
|
|
|
|
const uint16_t kmask1 = 0x3f3f;
|
|
const uint16_t kmask1 = 0x3f3f;
|
|
const uint16_t kmask2 = 0x0f0f;
|
|
const uint16_t kmask2 = 0x0f0f;
|
|
const uint16_t kmask3 = 0xc0c0;
|
|
const uint16_t kmask3 = 0xc0c0;
|
|
|
|
|
|
- const int tid = tpitg.y; // 0...16
|
|
|
|
- const int il = tid/4; // 0...3
|
|
|
|
- const int ir = tid - 4*il;// 0...3
|
|
|
|
- const int n = 4;
|
|
|
|
-
|
|
|
|
- const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
|
|
|
- const int in = il%2;
|
|
|
|
|
|
+ const int tid = tiisg/4;
|
|
|
|
+ const int ix = tiisg%4;
|
|
|
|
+ const int im = tid/4;
|
|
|
|
+ const int ir = tid%4;
|
|
|
|
+ const int n = 8;
|
|
|
|
|
|
- const int l0 = n*(2*ir + in);
|
|
|
|
|
|
+ const int l0 = n*ir;
|
|
const int q_offset = 32*im + l0;
|
|
const int q_offset = 32*im + l0;
|
|
const int y_offset = 64*im + l0;
|
|
const int y_offset = 64*im + l0;
|
|
|
|
|
|
@@ -1670,78 +1709,114 @@ kernel void kernel_mul_mat_q5_K_f32(
|
|
const uint8_t hm3 = hm1 << 4;
|
|
const uint8_t hm3 = hm1 << 4;
|
|
const uint8_t hm4 = hm2 << 4;
|
|
const uint8_t hm4 = hm2 << 4;
|
|
|
|
|
|
- uchar2 sc1, sc2, sc3, sc4;
|
|
|
|
|
|
+ uint16_t sc16[4];
|
|
|
|
+ thread const uint8_t * sc8 = (thread const uint8_t *)sc16;
|
|
|
|
|
|
- for (int i = tpitg.x; i < nb; i += tptg.x) {
|
|
|
|
|
|
+ device const float * y1 = yy + ix*QK_K + y_offset;
|
|
|
|
+
|
|
|
|
+ for (int i = ix; i < nb; i += 4) {
|
|
|
|
+
|
|
|
|
+ device const uint8_t * q1 = x[i].qs + q_offset;
|
|
|
|
+ device const uint8_t * qh = x[i].qh + l0;
|
|
|
|
+ device const half * dh = &x[i].d;
|
|
|
|
+ device const uint16_t * a = (device const uint16_t *)x[i].scales + im;
|
|
|
|
|
|
- device const uint8_t * q1 = (x + i)->qs + q_offset;
|
|
|
|
- device const uint8_t * q2 = q1 + 64;
|
|
|
|
- device const uint8_t * qh = (x + i)->qh + l0;
|
|
|
|
- device const float * y1 = yy + i*QK_K + y_offset;
|
|
|
|
- device const float * y2 = y1 + 128;
|
|
|
|
|
|
+ device const float * y2 = y1 + 128;
|
|
|
|
+ float4 sumy = {0.f, 0.f, 0.f, 0.f};
|
|
|
|
+ for (int l = 0; l < 8; ++l) {
|
|
|
|
+ yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0];
|
|
|
|
+ yl[l+8] = y1[l+32]; sumy[1] += yl[l+8];
|
|
|
|
+ yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0];
|
|
|
|
+ yh[l+8] = y2[l+32]; sumy[3] += yh[l+8];
|
|
|
|
+ }
|
|
|
|
|
|
- const float dall = (float)((x + i)->d);
|
|
|
|
- const float dmin = (float)((x + i)->dmin);
|
|
|
|
|
|
+ for (int row = 0; row < 2; ++row) {
|
|
|
|
|
|
- device const uint16_t * a = (device const uint16_t *)(x + i)->scales;
|
|
|
|
- sc1 = as_type<uchar2>((uint16_t)(a[im+0] & kmask1));
|
|
|
|
- sc2 = as_type<uchar2>((uint16_t)(a[im+2] & kmask1));
|
|
|
|
- sc3 = as_type<uchar2>((uint16_t)(((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2)));
|
|
|
|
- sc4 = as_type<uchar2>((uint16_t)(((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2)));
|
|
|
|
|
|
+ device const uint8_t * q2 = q1 + 64;
|
|
|
|
|
|
- float4 s = {0.f, 0.f, 0.f, 0.f};
|
|
|
|
- float smin = 0;
|
|
|
|
- for (int l = 0; l < n; ++l) {
|
|
|
|
|
|
+ sc16[0] = a[0] & kmask1;
|
|
|
|
+ sc16[1] = a[2] & kmask1;
|
|
|
|
+ sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2);
|
|
|
|
+ sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2);
|
|
|
|
|
|
- s[0] += y1[l+ 0] * ((q1[l] & 0xF) + (qh[l] & hm1 ? 16 : 0));
|
|
|
|
- s[1] += y1[l+32] * ((q1[l] >> 4) + (qh[l] & hm2 ? 16 : 0));
|
|
|
|
- s[2] += y2[l+ 0] * ((q2[l] & 0xF) + (qh[l] & hm3 ? 16 : 0));
|
|
|
|
- s[3] += y2[l+32] * ((q2[l] >> 4) + (qh[l] & hm4 ? 16 : 0));
|
|
|
|
- smin += y1[l] * sc2[0] + y1[l+32] * sc2[1] + y2[l] * sc4[0] + y2[l+32] * sc4[1];
|
|
|
|
|
|
+ float4 acc = {0.f, 0.f, 0.f, 0.f};
|
|
|
|
+ for (int l = 0; l < n; ++l) {
|
|
|
|
+ uint8_t h = qh[l];
|
|
|
|
+ acc[0] += yl[l+0] * ((uint16_t)(q1[l] & 0x0F) + (h & hm1 ? 16 : 0));
|
|
|
|
+ acc[1] += yl[l+8] * ((uint16_t)(q1[l] & 0xF0) + (h & hm2 ? 256 : 0));
|
|
|
|
+ acc[2] += yh[l+0] * ((uint16_t)(q2[l] & 0x0F) + (h & hm3 ? 16 : 0));
|
|
|
|
+ acc[3] += yh[l+8] * ((uint16_t)(q2[l] & 0xF0) + (h & hm4 ? 256 : 0));
|
|
|
|
+ }
|
|
|
|
+ const float dall = dh[0];
|
|
|
|
+ const float dmin = dh[1];
|
|
|
|
+ sumf[row] += dall * (acc[0] * sc8[0] + acc[1] * sc8[1] * 1.f/16.f + acc[2] * sc8[4] + acc[3] * sc8[5] * 1.f/16.f) -
|
|
|
|
+ dmin * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]);
|
|
|
|
+
|
|
|
|
+ q1 += step;
|
|
|
|
+ qh += step;
|
|
|
|
+ dh += step/2;
|
|
|
|
+ a += step/2;
|
|
|
|
|
|
}
|
|
}
|
|
- sumf += dall * (s[0] * sc1[0] + s[1] * sc1[1] + s[2] * sc3[0] + s[3] * sc3[1]) - dmin * smin;
|
|
|
|
|
|
+
|
|
|
|
+ y1 += 4 * QK_K;
|
|
|
|
|
|
}
|
|
}
|
|
#else
|
|
#else
|
|
- const int il = 4 * tpitg.x; // 0, 4, 8, 12
|
|
|
|
- const int im = il/8; // 0, 0, 1, 1
|
|
|
|
- const int in = il%8; // 0, 4, 0, 4
|
|
|
|
|
|
+ float yl[8], yh[8];
|
|
|
|
|
|
- for (int i = tpitg.y; i < nb; i += tptg.y) {
|
|
|
|
|
|
+ const int il = 4 * (tiisg/8); // 0, 4, 8, 12
|
|
|
|
+ const int ix = tiisg%8;
|
|
|
|
+ const int im = il/8; // 0, 0, 1, 1
|
|
|
|
+ const int in = il%8; // 0, 4, 0, 4
|
|
|
|
|
|
- const float d = (float)x[i].d;
|
|
|
|
|
|
+ device const float * y = yy + ix*QK_K + il;
|
|
|
|
+
|
|
|
|
+ for (int i = ix; i < nb; i += 8) {
|
|
|
|
+
|
|
|
|
+ float4 sumy = {0.f, 0.f, 0.f, 0.f};
|
|
|
|
+ for (int l = 0; l < 4; ++l) {
|
|
|
|
+ yl[l+0] = y[l+ 0];
|
|
|
|
+ yl[l+4] = y[l+16];
|
|
|
|
+ yh[l+0] = y[l+32];
|
|
|
|
+ yh[l+4] = y[l+48];
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ device const half * dh = &x[i].d;
|
|
device const uint8_t * q = x[i].qs + il;
|
|
device const uint8_t * q = x[i].qs + il;
|
|
device const uint8_t * h = x[i].qh + in;
|
|
device const uint8_t * h = x[i].qh + in;
|
|
device const int8_t * s = x[i].scales;
|
|
device const int8_t * s = x[i].scales;
|
|
- device const float * y = yy + i*QK_K + il;
|
|
|
|
|
|
|
|
- for (int l = 0; l < 4; ++l) {
|
|
|
|
- const uint8_t hl = h[l] >> im;
|
|
|
|
- sumf += y[l+ 0] * d * s[0] * ((q[l+ 0] & 0xF) - (hl & 0x01 ? 0 : 16))
|
|
|
|
- + y[l+16] * d * s[1] * ((q[l+16] & 0xF) - (hl & 0x04 ? 0 : 16))
|
|
|
|
- + y[l+32] * d * s[2] * ((q[l+ 0] >> 4) - (hl & 0x10 ? 0 : 16))
|
|
|
|
- + y[l+48] * d * s[3] * ((q[l+16] >> 4) - (hl & 0x40 ? 0 : 16));
|
|
|
|
|
|
+ for (int row = 0; row < 2; ++row) {
|
|
|
|
+
|
|
|
|
+ const float d = dh[0];
|
|
|
|
+
|
|
|
|
+ float2 acc = {0.f, 0.f};
|
|
|
|
+ for (int l = 0; l < 4; ++l) {
|
|
|
|
+ const uint8_t hl = h[l] >> im;
|
|
|
|
+ acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
|
|
|
|
+ + yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
|
|
|
|
+ acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
|
|
|
|
+ + yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
|
|
|
|
+ }
|
|
|
|
+ sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
|
|
|
|
+
|
|
|
|
+ q += step;
|
|
|
|
+ h += step;
|
|
|
|
+ s += step;
|
|
|
|
+ dh += step/2;
|
|
|
|
+
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ y += 8 * QK_K;
|
|
}
|
|
}
|
|
#endif
|
|
#endif
|
|
- sum[ith] = sumf;
|
|
|
|
|
|
|
|
- //
|
|
|
|
- // Accumulate the sum from all threads in the threadgroup
|
|
|
|
- //
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith%4 == 0) {
|
|
|
|
- sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
|
|
|
- }
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith%16 == 0) {
|
|
|
|
- sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
|
|
|
- }
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith == 0) {
|
|
|
|
- for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
|
|
|
- dst[r1*ne0 + r0] = sum[0];
|
|
|
|
|
|
+ for (int row = 0; row < 2; ++row) {
|
|
|
|
+ const float tot = simd_sum(sumf[row]);
|
|
|
|
+ if (tiisg == 0) {
|
|
|
|
+ dst[r1*ne0 + first_row + row] = tot;
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
}
|
|
}
|
|
@@ -1753,10 +1828,9 @@ kernel void kernel_mul_mat_q6_K_f32(
|
|
constant int64_t & ne00,
|
|
constant int64_t & ne00,
|
|
constant int64_t & ne10,
|
|
constant int64_t & ne10,
|
|
constant int64_t & ne0,
|
|
constant int64_t & ne0,
|
|
- threadgroup float * sum [[threadgroup(0)]],
|
|
|
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
|
uint2 tgpig[[threadgroup_position_in_grid]],
|
|
- uint2 tpitg[[thread_position_in_threadgroup]],
|
|
|
|
- uint2 tptg[[threads_per_threadgroup]]) {
|
|
|
|
|
|
+ uint tiisg[[thread_index_in_simdgroup]],
|
|
|
|
+ uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
|
|
|
|
|
const uint8_t kmask1 = 0x03;
|
|
const uint8_t kmask1 = 0x03;
|
|
const uint8_t kmask2 = 0x0C;
|
|
const uint8_t kmask2 = 0x0C;
|
|
@@ -1768,19 +1842,18 @@ kernel void kernel_mul_mat_q6_K_f32(
|
|
const int64_t r0 = tgpig.x;
|
|
const int64_t r0 = tgpig.x;
|
|
const int64_t r1 = tgpig.y;
|
|
const int64_t r1 = tgpig.y;
|
|
|
|
|
|
- device const block_q6_K * x = (device const block_q6_K *) src0 + r0*nb;
|
|
|
|
- device const float * yy = (device const float *) src1 + r1*ne10;
|
|
|
|
|
|
+ const int row = 2 * r0 + sgitg;
|
|
|
|
|
|
- const int nth = tptg.x*tptg.y;
|
|
|
|
- const int ith = tptg.y*tpitg.x + tpitg.y;
|
|
|
|
|
|
+ device const block_q6_K * x = (device const block_q6_K *) src0 + row * nb; //r0*nb;
|
|
|
|
+ device const float * yy = (device const float *) src1 + r1*ne10;
|
|
|
|
|
|
float sumf = 0;
|
|
float sumf = 0;
|
|
|
|
|
|
#if QK_K == 256
|
|
#if QK_K == 256
|
|
- // Note: we absolutely assume that tptg.y = 16 and QK_K = 256!
|
|
|
|
- const int iqs = 16 * tpitg.y;
|
|
|
|
- const int ip = iqs / 128; // 0 or 1
|
|
|
|
- const int il = (iqs - 128*ip)/16; // 0...7
|
|
|
|
|
|
+ const int tid = tiisg/2;
|
|
|
|
+ const int ix = tiisg%2;
|
|
|
|
+ const int ip = tid/8; // 0 or 1
|
|
|
|
+ const int il = tid%8;
|
|
const int n = 4;
|
|
const int n = 4;
|
|
const int l0 = n*il;
|
|
const int l0 = n*il;
|
|
const int is = 8*ip + l0/16;
|
|
const int is = 8*ip + l0/16;
|
|
@@ -1789,9 +1862,10 @@ kernel void kernel_mul_mat_q6_K_f32(
|
|
const int q_offset_l = 64*ip + l0;
|
|
const int q_offset_l = 64*ip + l0;
|
|
const int q_offset_h = 32*ip + l0;
|
|
const int q_offset_h = 32*ip + l0;
|
|
|
|
|
|
- for (int i = tpitg.x; i < nb; i += tptg.x) {
|
|
|
|
|
|
+ for (int i = ix; i < nb; i += 2) {
|
|
|
|
|
|
- device const uint8_t * ql = x[i].ql + q_offset_l;
|
|
|
|
|
|
+ device const uint8_t * q1 = x[i].ql + q_offset_l;
|
|
|
|
+ device const uint8_t * q2 = q1 + 32;
|
|
device const uint8_t * qh = x[i].qh + q_offset_h;
|
|
device const uint8_t * qh = x[i].qh + q_offset_h;
|
|
device const int8_t * sc = x[i].scales + is;
|
|
device const int8_t * sc = x[i].scales + is;
|
|
|
|
|
|
@@ -1801,19 +1875,21 @@ kernel void kernel_mul_mat_q6_K_f32(
|
|
|
|
|
|
float4 sums = {0.f, 0.f, 0.f, 0.f};
|
|
float4 sums = {0.f, 0.f, 0.f, 0.f};
|
|
for (int l = 0; l < n; ++l) {
|
|
for (int l = 0; l < n; ++l) {
|
|
- sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
|
|
|
|
- sums[1] += y[l+32] * ((int8_t)((ql[l+32] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
|
|
|
|
- sums[2] += y[l+64] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) << 0)) - 32);
|
|
|
|
- sums[3] += y[l+96] * ((int8_t)((ql[l+32] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
|
|
|
|
|
|
+ sums[0] += y[l+ 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
|
|
|
|
+ sums[1] += y[l+32] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
|
|
|
|
+ sums[2] += y[l+64] * ((int8_t)((q1[l] >> 4) | ((qh[l] & kmask3) << 0)) - 32);
|
|
|
|
+ sums[3] += y[l+96] * ((int8_t)((q2[l] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
|
|
}
|
|
}
|
|
|
|
|
|
sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
|
|
sumf += dall * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);
|
|
|
|
|
|
}
|
|
}
|
|
|
|
+
|
|
#else
|
|
#else
|
|
- const int il = 4*tpitg.x; // 0, 4, 8, 12
|
|
|
|
|
|
+ const int ix = tiisg/4;
|
|
|
|
+ const int il = 4*(tiisg%4);
|
|
|
|
|
|
- for (int i = tpitg.y; i < nb; i += tptg.y) {
|
|
|
|
|
|
+ for (int i = ix; i < nb; i += 8) {
|
|
device const float * y = yy + i * QK_K + il;
|
|
device const float * y = yy + i * QK_K + il;
|
|
device const uint8_t * ql = x[i].ql + il;
|
|
device const uint8_t * ql = x[i].ql + il;
|
|
device const uint8_t * qh = x[i].qh + il;
|
|
device const uint8_t * qh = x[i].qh + il;
|
|
@@ -1833,23 +1909,8 @@ kernel void kernel_mul_mat_q6_K_f32(
|
|
|
|
|
|
#endif
|
|
#endif
|
|
|
|
|
|
- sum[ith] = sumf;
|
|
|
|
-
|
|
|
|
- //
|
|
|
|
- // Accumulate the sum from all threads in the threadgroup
|
|
|
|
- //
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith%4 == 0) {
|
|
|
|
- for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i];
|
|
|
|
|
|
+ const float tot = simd_sum(sumf);
|
|
|
|
+ if (tiisg == 0) {
|
|
|
|
+ dst[r1*ne0 + row] = tot;
|
|
}
|
|
}
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith%16 == 0) {
|
|
|
|
- for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i];
|
|
|
|
- }
|
|
|
|
- threadgroup_barrier(mem_flags::mem_threadgroup);
|
|
|
|
- if (ith == 0) {
|
|
|
|
- for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
|
|
|
- dst[r1*ne0 + r0] = sum[0];
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
}
|
|
}
|