@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git e782c9e735f93ab4767ffc37462c523b73a17ddc
+ * llama.cpp - git 7c529cede6e84054e77a3eceab31c53de7b2f55b
  *
  * MIT License
  *
@@ -246,7 +246,7 @@ typedef struct {
 static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
 
 #define WARP_SIZE 32
-#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
 #define CUDA_ADD_BLOCK_SIZE 256
 #define CUDA_MUL_BLOCK_SIZE 256
@@ -358,12 +358,10 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols) {
     }
 }
 
-static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols) {
+static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) {
     const int row = blockIdx.x*blockDim.y + threadIdx.y;
     const int tid = threadIdx.x;
 
-    const float eps = 1e-6f;
-
     float tmp = 0.0f; // partial sum for thread in warp
 
     for (int col = tid; col < ncols; col += WARP_SIZE) {
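
For reference, rms_norm_f32 scales each row by 1/sqrt(mean(x^2) + eps); the change above only turns the previously hard-coded 1e-6f into a kernel argument supplied by the caller. A minimal host-side C++ sketch of the per-row computation (reference only, not the CUDA kernel itself):

    #include <cmath>

    // Reference RMS norm for one row; eps is now supplied by the caller
    // instead of being fixed at 1e-6f inside the kernel.
    static void rms_norm_row(const float * x, float * dst, int ncols, float eps) {
        float sum_sq = 0.0f;
        for (int i = 0; i < ncols; ++i) {
            sum_sq += x[i] * x[i];
        }
        const float mean  = sum_sq / ncols;
        const float scale = 1.0f / std::sqrt(mean + eps);
        for (int i = 0; i < ncols; ++i) {
            dst[i] = x[i] * scale;
        }
    }
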
@@ -961,12 +959,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;
 
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
     float tmp = 0; // partial sum for thread in warp
 
     for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
-        const uint8_t * q1 = x[i].qs + q_offset;
-        const uint8_t * q2 = q1 + 64;
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
 
@@ -979,14 +983,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
         aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
         aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
 
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
         float4 s = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
-            s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+        for (int l = 0; l < 4; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4];
+            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
             smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
         }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
 
     }
 #else
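
The new code above loads four quant bytes at a time and splits them with 32-bit masks: masking with 0x0f0f0f0f keeps the four low nibbles, while masking with 0xf0f0f0f0 keeps the four high nibbles still multiplied by 16, which is why the final sum applies sc[1] and sc[5] with an extra 1/16 factor instead of shifting every byte. A small standalone C++ demonstration of the masking (the packed value is arbitrary):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Four q4_K bytes, each holding two 4-bit quants (low and high nibble).
        const uint32_t packed = 0x9C7B5A31u;

        const uint32_t lo = packed & 0x0f0f0f0fu; // low nibbles: 0x01, 0x0A, 0x0B, 0x0C
        const uint32_t hi = packed & 0xf0f0f0f0u; // high nibbles, still scaled by 16

        const uint8_t * lo_b = (const uint8_t *)&lo;
        const uint8_t * hi_b = (const uint8_t *)&hi;

        for (int i = 0; i < 4; ++i) {
            // hi_b[i]/16 recovers the high-nibble value; the kernel folds this
            // division into the scale (sc[1] * 1.f/16.f) instead of shifting.
            printf("byte %d: low = %u, high = %u\n", i, lo_b[i], hi_b[i] / 16);
        }
        return 0;
    }
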
@@ -1066,10 +1097,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;
 
+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
     for (int i = ix; i < num_blocks_per_row; i += 2) {
 
         const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * ql2 = ql1 + 64;
         const uint8_t * qh = x[i].qh + l0;
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
@@ -1085,15 +1118,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
 
         float4 sum = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
         for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
-                   + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
             smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                   + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
         }
@@ -1547,33 +1590,95 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
-    const int bq8_offset = QR4_K * (iqs / QI8_1);
-
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
+#ifndef GGML_QKK_64
+
+    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));
+
     const float d = bq4_K->d;
     const float dmin = bq4_K->dmin;
 
-    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
 
     for (int i = 0; i < QR4_K; ++i) {
-        const int isc = bq8_offset + i;
-
-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq4_K->scales, sc, m);
 
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
         const float d8i = bq8i->d;
+        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int ui1 = q8[0];
+        const int ui2 = q8[4];
 
-        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+        const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
+        const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;
 
-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+
+        sumf_d += d8i * (dot1 * sc[i]);
+        sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
     }
 
     return d*sumf_d - dmin*sumf_m;
+
+#else
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    const uint16_t * a = (const uint16_t *)bq4_K->scales;
+    aux16[0] = a[0] & 0x0f0f;
+    aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+    const float dall = bq4_K->d[0];
+    const float dmin = bq4_K->d[1];
+
+    const float d8_1 = bq8_1[0].d;
+    const float d8_2 = bq8_1[1].d;
+
+    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+    const int * q4 = (const int *)bq4_K->qs + iqs;
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+    return dall * sumf_d - dmin * sumf_m;
+
+#endif
+
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
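
Both branches above feed pairs of 32-bit words into nested __dp4a calls. __dp4a(a, b, c) multiplies the four signed bytes of a with the four signed bytes of b and adds the result to c, so chaining two calls accumulates an 8-element integer dot product before it is scaled by the per-block scale. A host-side sketch that emulates the intrinsic (dp4a_ref and the test values are made up for illustration):

    #include <cstdint>
    #include <cstdio>

    // Host-side emulation of the __dp4a(int, int, int) intrinsic used above.
    static int dp4a_ref(int a, int b, int c) {
        const int8_t * va = (const int8_t *)&a;
        const int8_t * vb = (const int8_t *)&b;
        for (int i = 0; i < 4; ++i) {
            c += (int)va[i] * (int)vb[i];
        }
        return c;
    }

    int main() {
        const int v1 = 0x01020304; // four packed quant values
        const int v2 = 0x05060708;
        const int u1 = 0x01010101; // four q8_1 values, all equal to 1 here
        const int u2 = 0x02020202; // four q8_1 values, all equal to 2 here

        // Same nesting pattern as dot1 in vec_dot_q4_K_q8_1.
        const int dot = dp4a_ref(v2, u2, dp4a_ref(v1, u1, 0));
        printf("dot = %d\n", dot); // (1+2+3+4)*1 + (5+6+7+8)*2 = 62
        return 0;
    }
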
@@ -1585,7 +1690,11 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
-    const int bq8_offset = QR5_K * (iqs / QI8_1);
+#ifndef GGML_QKK_64
+
+    const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));
 
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
@@ -1593,31 +1702,87 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     const float d = bq5_K->d;
     const float dmin = bq5_K->dmin;
 
-    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
 
-    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+    const int vh1 = qh[0] >> bq8_offset;
+    const int vh2 = qh[4] >> bq8_offset;
 
-    for (int i = 0; i < QR5_K; ++i) {
-        const int isc = bq8_offset + i;
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
 
-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+    for (int i = 0; i < QR5_K; ++i) {
 
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
         const float d8i = bq8i->d;
+        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int ui1 = q8[0];
+        const int ui2 = q8[4];
 
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+        const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
+        const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
 
-        const int vih = ((vh >> i) << 4) & 0x10101010;
+        const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
+        const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
 
-        const int vi = vil | vih;
+        const int vi1 = vil1 | vih1;
+        const int vi2 = vil2 | vih2;
+
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+
+        sumf_d += d8i * (dot1 * sc[i]);
+        sumf_m += d8i * (dot2 * m[i]);
 
-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
     }
 
     return d*sumf_d - dmin*sumf_m;
+
+#else
+
+    const int8_t * s = bq5_K->scales;
+
+    const float d = bq5_K->d;
+
+    const float d8_1 = bq8_1[0].d;
+    const float d8_2 = bq8_1[1].d;
+
+    const int ui1 = *((const int *)bq8_1[0].qs + iqs);
+    const int ui2 = *((const int *)bq8_1[0].qs + iqs + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + iqs);
+    const int ui4 = *((const int *)bq8_1[1].qs + iqs + 4);
+
+    const int * ql = (const int *)bq5_K->qs + iqs;
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
+
+    const int step = 4 * iqs; // 0, 4, 8, 12
+    const int im = step/8;    // = 0 for iqs = 0, 1, = 1 for iqs = 2, 3
+    const int in = step%8;    // 0, 4, 0, 4
+    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+    return d * sumf_d;
+
+#endif
+
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
@@ -1771,11 +1936,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons
     }
 }
 
-static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
+static __global__ void mul_mat_p021_f16_f32(
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
+
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+    const int channel_x = channel / (nchannels_y / nchannels_x);
 
     const int nrows_y = ncols_x;
     const int nrows_dst = nrows_x;
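
The extra nchannels_y argument lets src1 carry more channels than the f16 tensor x; each output channel is mapped back to a source channel by integer division, so several consecutive y-channels read the same x-channel. A tiny sketch of that index mapping, with made-up channel counts:

    #include <cstdio>

    int main() {
        // Hypothetical channel counts: x has 8 channels, y has 32, so every
        // 4 consecutive y-channels are matched with the same x-channel.
        const int nchannels_x = 8;
        const int nchannels_y = 32;

        for (int channel = 0; channel < nchannels_y; channel += 8) {
            const int channel_x = channel / (nchannels_y / nchannels_x);
            printf("y-channel %2d -> x-channel %d\n", channel, channel_x);
        }
        return 0;
    }
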
@@ -1791,7 +1960,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
         }
 
         // x is transposed and permuted
-        const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
+        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
         const float xi = __half2float(x[ix]);
 
         const int row_y = col_x;
@@ -1819,12 +1988,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const
 
 static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x) {
+    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
 
     const half * x = (const half *) vx;
 
     const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
     const int channel = blockDim.z*blockIdx.z + threadIdx.z;
+    const int channel_x = channel / channel_x_divisor;
 
     const int nrows_y = ncols_x;
     const int nrows_dst = nrows_x;
@@ -1841,7 +2011,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
             break;
         }
 
-        const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
         const float xi = __half2float(x[ix]);
 
         const int row_y = col_x;
@@ -2053,10 +2223,10 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
     norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
-static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
+    rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
 }
 
 static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
@@ -2285,7 +2455,10 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+    // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in a better performance because the cost of computing the k-quant scales
+    // is better amortized.
+    mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
@@ -2294,7 +2467,10 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+    // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in a better performance because the cost of computing the k-quant scales
+    // is better amortized.
+    mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
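
A rough sketch of the arithmetic behind the QI4_K/2 argument mentioned in the comments above: halving the template's quants-per-iteration parameter halves the iqs range, so each vec_dot call loads two 32-bit quant words instead of one and the scale unpacking it performs is amortized over twice as much data. The constants below (WARP_SIZE = 32, QI4_K = 32) follow the definitions used elsewhere in this file; the rest is illustrative arithmetic, not the template itself:

    #include <cstdio>

    int main() {
        const int WARP_SIZE = 32;
        const int QI4_K = 32; // 32-bit integers per q4_K block

        const int qi_values[2] = {QI4_K, QI4_K/2};
        for (int k = 0; k < 2; ++k) {
            const int qi = qi_values[k];
            const int blocks_per_warp = WARP_SIZE / qi; // blocks covered per warp iteration
            const int ints_per_call   = QI4_K / qi;     // 32-bit quant words per vec_dot call
            printf("qi = %2d: iqs runs 0..%2d, %d block(s) per warp iteration, %d int(s) per call\n",
                   qi, qi - 1, blocks_per_warp, ints_per_call);
        }
        return 0;
    }
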
@@ -2350,20 +2526,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     }
 }
 
-static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
-    const dim3 block_nums(1, nrows_x, nchannels_x);
+static void ggml_mul_mat_p021_f16_f32_cuda(
+    const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
+    const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
+
+    const dim3 block_nums(1, nrows_x, nchannels_y);
     const dim3 block_dims(WARP_SIZE, 1, 1);
-    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x);
+    mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
 }
 
 static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
-    const int nchannels_x, const int channel_stride_x, cudaStream_t stream) {
+    const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
 
-    const dim3 block_nums(1, nrows_x, nchannels_x);
+    const dim3 block_nums(1, nrows_x, nchannels_y);
     const dim3 block_dims(WARP_SIZE, 1, 1);
     mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
-        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x);
+        (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
 }
 
 static void ggml_cpy_f32_f32_cuda(
@@ -2449,20 +2628,53 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0, tot_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
         cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.size >= size && b.ptr != nullptr) {
-            void * ptr = b.ptr;
-            *actual_size = b.size;
-            b.ptr = nullptr;
-            b.size = 0;
-            return ptr;
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            tot_size += b.size;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
         }
     }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+#endif
     void * ptr;
-    CUDA_CHECK(cudaMalloc((void **) &ptr, size));
-    *actual_size = size;
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
+    *actual_size = look_ahead_size;
     return ptr;
 }
 
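
The pool allocator above now scans all cached buffers for the best (smallest-waste) fit and, when it does have to call cudaMalloc, over-allocates by 5% rounded up to a multiple of 256 bytes so that slightly larger follow-up requests can still reuse the buffer. A small standalone sketch of just that rounding step:

    #include <cstddef>
    #include <cstdio>

    // Same look-ahead rounding as above: grow the request by 5% and round up
    // to a multiple of 256 bytes.
    static size_t look_ahead(size_t size) {
        size_t s = (size_t)(1.05 * size);
        return 256 * ((s + 255) / 256);
    }

    int main() {
        const size_t requests[3] = {1000, 4096, 1 << 20}; // arbitrary example sizes
        for (int i = 0; i < 3; ++i) {
            printf("request %8zu bytes -> allocate %8zu bytes\n", requests[i], look_ahead(requests[i]));
        }
        return 0;
    }
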
@@ -2490,7 +2702,9 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+#ifndef GGML_CUDA_FORCE_DMMV
 static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+#endif
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2513,7 +2727,9 @@ void ggml_init_cublas() {
             g_tensor_split[id] = total_vram;
             total_vram += prop.totalGlobalMem;
 
+#ifndef GGML_CUDA_FORCE_DMMV
             g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif
         }
         for (int id = 0; id < g_device_count; ++id) {
             g_tensor_split[id] /= total_vram;
@@ -2538,6 +2754,9 @@
 }
 
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {
@@ -2678,6 +2897,7 @@ inline void ggml_cuda_op_mul(
     (void) dst;
     (void) src0_ddq_i;
     (void) i02;
+    (void) i1;
 }
 
 inline void ggml_cuda_op_gelu(
@@ -2757,8 +2977,11 @@ inline void ggml_cuda_op_rms_norm(
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
     // compute
-    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
+    rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, eps, cudaStream_main);
 
     (void) src1;
     (void) dst;
@@ -2805,8 +3028,8 @@ inline void ggml_cuda_op_mul_mat_vec(
 #endif
 
     if (use_mul_mat_vec_q) {
-        int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
-        padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
+        const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ?
+            ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
         size_t as;
         void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
         quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
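
Both the old and the new expression round ne00 up to the next multiple of MATRIX_ROW_PADDING; the rewrite mainly makes the value const and the intent explicit. A quick standalone check of the equivalence (sizes chosen arbitrarily):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t P = 512; // MATRIX_ROW_PADDING after this change

        const int64_t ne00_values[3] = {4096, 4097, 511};
        for (int i = 0; i < 3; ++i) {
            const int64_t ne00 = ne00_values[i];

            // old formulation
            int64_t a = ne00 + P - 1;
            a -= a % P;

            // new formulation
            const int64_t b = ne00 % P == 0 ? ne00 : ne00 - ne00 % P + P;

            assert(a == b); // both round ne00 up to the next multiple of P
            printf("ne00 = %5lld -> padded_row_size = %5lld\n", (long long)ne00, (long long)b);
        }
        return 0;
    }
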
@@ -2973,13 +3196,18 @@ inline void ggml_cuda_op_rope(
     const int64_t ne00 = src0->ne[0];
     const int64_t i01_diff = i01_high - i01_low;
 
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode = ((int32_t *) src1->data)[2];
-    const int n_ctx = ((int32_t *) src1->data)[3];
+    const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode = ((int32_t *) dst->op_params)[2];
+    const int n_ctx = ((int32_t *) dst->op_params)[3];
+    // RoPE alteration for extended context
 
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-    const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
+    float freq_base, freq_scale;
+    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float p = (((mode & 1) == 0 ? n_past + i02 : i02)) * freq_scale;
 
     bool is_glm = mode & 4;
 
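
With the two extra op_params, the RoPE angle for dimension pair i becomes theta_i = (position * freq_scale) * freq_base^(-2i/n_dims), so freq_base = 10000 and freq_scale = 1 reproduce the previous hard-coded behaviour. A short host-side sketch of how the angles are generated (the values and the loop bound are illustrative only):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_dims     = 128;
        const int   n_past     = 42;       // position of the token being rotated
        const float freq_base  = 10000.0f; // replaces the hard-coded 10000.0
        const float freq_scale = 1.0f;     // stretches the position index

        const float theta_scale = powf(freq_base, -2.0f/n_dims);

        float theta = (float) n_past * freq_scale;
        for (int i0 = 0; i0 < 6; i0 += 2) { // first few dimension pairs
            printf("dim pair %d: cos = % .4f, sin = % .4f\n", i0, cosf(theta), sinf(theta));
            theta *= theta_scale;
        }
        return 0;
    }
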
@@ -2992,6 +3220,7 @@ inline void ggml_cuda_op_rope(
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
     }
 
+    (void) src1;
     (void) dst;
     (void) src0_ddq_i;
     (void) src1_ddf_i;
@@ -3010,11 +3239,12 @@ inline void ggml_cuda_op_diag_mask_inf(
     const int64_t ne01 = src0->ne[1];
     const int64_t i01_diff = i01_high - i01_low;
 
-    const int n_past = ((int32_t *) src1->data)[0];
+    const int n_past = ((int32_t *) dst->op_params)[0];
 
     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
 
+    (void) src1;
     (void) dst;
     (void) src0_ddq_i;
     (void) src1_ddf_i;
@@ -3082,6 +3312,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
     const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
     const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(ne03 == ne13);
 
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
@@ -3093,12 +3326,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
 
     // strides for iteration over dims 3 and 2
-    const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03;
-    const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1;
+    const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+    const int64_t num_iters = flatten_rows ? 1 : num_iters_0;
+    const int64_t stride_mod = flatten_rows ? num_iters_0 : 1;
     const int64_t src0_stride = ne00 * ne01 * stride_mod;
     const int64_t src1_stride = ne10 * ne11 * stride_mod;
     const int64_t dst_stride = ne0 * ne1 * stride_mod;
 
+    const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
+    const int64_t i03_max = flatten_rows ? 1 : ne03;
+    const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12);
+    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+    GGML_ASSERT(!(flatten_rows && ne02 < ne12));
+
     const size_t src0_ts = ggml_type_size(src0->type);
     const size_t src0_bs = ggml_blck_size(src0->type);
 
@@ -3115,6 +3355,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
 
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 < ne12));
 
     const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
 
@@ -3151,7 +3392,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1];
         } else {
             row_low = 0;
-            row_high = nrows0;
+            row_high = nrows0*i02_divisor;
         }
         if (row_low == row_high) {
             continue;
@@ -3199,16 +3440,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
             dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
         }
 
-        const int64_t i03_max = flatten_rows ? 1 : ne03;
-        const int64_t i02_max = flatten_rows ? 1 : ne02;
-        const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01;
-
        for (int64_t i03 = 0; i03 < i03_max; i03++) {
            const int64_t i13 = i03 % ne13;
            for (int64_t i02 = 0; i02 < i02_max; i02++) {
                const int64_t i12 = i02 % ne12;
 
-                const int64_t i0 = i03*ne02 + i02;
+                const int64_t i0 = i03*i02_max + i02;
 
                // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
                const int64_t i0_offset_low = row_low/rows_per_iter;
@@ -3242,10 +3479,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                const int64_t i11 = i13*ne12 + i12;
 
                // for split tensors the data begins at i0 == i0_offset_low
-                char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
-                float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
+                char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+                float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
                float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
-                float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
+                float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
 
                // for split tensors the data pointer needs to be rounded down
                // to the bin edge for i03, i02 bins beyond the first
@@ -3284,11 +3521,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
                    }
                }
 
-                if (!src0_on_device || !src0_is_contiguous) {
+                if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
                    if (src0_is_f32) {
-                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
                    } else {
-                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main));
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
                    }
                }
 
@@ -3442,6 +3679,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
 
+    const int64_t ne12 = src1->ne[2];
+
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
@@ -3454,7 +3693,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main);
 }
 
 void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -3468,6 +3707,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
 
+    const int64_t ne12 = src1->ne[2];
+
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
@@ -3486,7 +3727,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
     const int row_stride_x = nb01 / sizeof(half);
     const int channel_stride_x = nb02 / sizeof(half);
 
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, cudaStream_main);
 }
 
 void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3627,7 +3868,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         size_t size = ggml_nbytes_split(tensor, nrows_split);
         const size_t original_size = size;
 
-        // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
         if (ne0 % MATRIX_ROW_PADDING != 0) {
             size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
                 * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
@@ -3643,7 +3884,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         }
 
-        CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice));
+        CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
 
        extra->data_device[id] = buf;
 
@@ -3723,7 +3964,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo
        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
        size_t offset = 0;
        if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->src[2]->data, sizeof(size_t));
+            memcpy(&offset, tensor->op_params, sizeof(size_t));
        }
        extra = ggml_cuda_alloc_temp_tensor_extra();
        extra->data_device[g_main_device] = src0_ddc + offset;
@@ -3825,18 +4066,23 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
            }
            func = ggml_cuda_mul;
            break;
-        case GGML_OP_GELU:
-            if (!any_on_device) {
-                return false;
-            }
-            func = ggml_cuda_gelu;
-            break;
-        case GGML_OP_SILU:
-            if (!any_on_device) {
-                return false;
-            }
-            func = ggml_cuda_silu;
-            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    if (!any_on_device) {
+                        return false;
+                    }
+                    func = ggml_cuda_silu;
+                    break;
+                default:
+                    return false;
+            } break;
        case GGML_OP_NORM:
            if (!any_on_device) {
                return false;