4 months ago · 80d41b579b
--- a/llama/ggml-metal-embed.metal
+++ b/llama/ggml-metal-embed.metal
@@ -2081,6 +2081,7 @@ typedef struct {
 
				     float    attn_factor;
			
 
				     float    beta_fast;
			
 
				     float    beta_slow;
			
 
				+    int32_t  sections[4];
			
 
				 } ggml_metal_kargs_rope;
			
 
				 
			
 
				 typedef struct {
			
@@ -4785,8 +4786,148 @@ kernel void kernel_rope_neox(
 
				     }
			
 
				 }
			
 
				 
			
 
				+
			
 
				+template<typename T>
			
 
				+kernel void kernel_rope_multi(
			
 
				+        constant ggml_metal_kargs_rope & args,
			
 
				+        device const char * src0,
			
 
				+        device const char * src1,
			
 
				+        device const char * src2,
			
 
				+        device       char * dst,
			
 
				+        ushort  tiitg[[thread_index_in_threadgroup]],
			
 
				+        ushort3 tptg [[threads_per_threadgroup]],
			
 
				+        uint3   tgpig[[threadgroup_position_in_grid]]) {
			
 
				+    const int i3 = tgpig[2];
			
 
				+    const int i2 = tgpig[1];
			
 
				+    const int i1 = tgpig[0];
			
 
				+
			
 
				+    float corr_dims[2];
			
 
				+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
			
 
				+
			
 
				+    device const int32_t * pos = (device const int32_t *) src1;
			
 
				+
			
 
				+    int sect_dims = args.sections[0] + args.sections[1] + args.sections[2] + args.sections[3];
			
 
				+    int sec_w = args.sections[1] + args.sections[0];
			
 
				+
			
 
				+    const float inv_ndims = -1.f/args.n_dims;
			
 
				+
			
 
				+    float cos_theta;
			
 
				+    float sin_theta;
			
 
				+
			
 
				+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
			
 
				+        if (i0 < args.n_dims) {
			
 
				+            const int ic = i0/2;
			
 
				+            const int sector = ic % sect_dims;
			
 
				+
			
 
				+            float theta_base = (float) pos[i2];
			
 
				+            if (sector >= args.sections[0] && sector < sec_w) {
			
 
				+                theta_base = (float) pos[i2 + args.ne2];
			
 
				+            }
			
 
				+            else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
			
 
				+                theta_base = (float) pos[i2 + args.ne2 * 2];
			
 
				+            }
			
 
				+            else if (sector >= sec_w + args.sections[2]) {
			
 
				+                theta_base = (float) pos[i2 + args.ne2 * 3];
			
 
				+            }
			
 
				+
			
 
				+            float theta = theta_base*pow(args.freq_base, inv_ndims*i0);
			
 
				+
			
 
				+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
			
 
				+
			
 
				+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
			
 
				+
			
 
				+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
			
 
				+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
			
 
				+
			
 
				+            const float x0 = src[0];
			
 
				+            const float x1 = src[args.n_dims/2];
			
 
				+
			
 
				+            dst_data[0]             = x0*cos_theta - x1*sin_theta;
			
 
				+            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
			
 
				+        } else {
			
 
				+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
			
 
				+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
			
 
				+
			
 
				+            dst_data[0] = src[0];
			
 
				+            dst_data[1] = src[1];
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+template<typename T>
			
 
				+kernel void kernel_rope_vision(
			
 
				+        constant ggml_metal_kargs_rope & args,
			
 
				+        device const char * src0,
			
 
				+        device const char * src1,
			
 
				+        device const char * src2,
			
 
				+        device       char * dst,
			
 
				+        ushort  tiitg[[thread_index_in_threadgroup]],
			
 
				+        ushort3 tptg [[threads_per_threadgroup]],
			
 
				+        uint3   tgpig[[threadgroup_position_in_grid]]) {
			
 
				+    const int i3 = tgpig[2];
			
 
				+    const int i2 = tgpig[1];
			
 
				+    const int i1 = tgpig[0];
			
 
				+
			
 
				+    float corr_dims[2];
			
 
				+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
			
 
				+
			
 
				+    device const int32_t * pos = (device const int32_t *) src1;
			
 
				+
			
 
				+    int sect_dims = args.sections[0] + args.sections[1];
			
 
				+    int sec_w = args.sections[1] + args.sections[0];
			
 
				+    int sec_e = args.sections[2] + sec_w;
			
 
				+
			
 
				+    const float inv_ndims = -1.f/args.n_dims;
			
 
				+
			
 
				+    float cos_theta;
			
 
				+    float sin_theta;
			
 
				+
			
 
				+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
			
 
				+        const int ic = i0/2;
			
 
				+        const int sector = ic % sect_dims;
			
 
				+
			
 
				+        float theta_base = (float) pos[i2];
			
 
				+        if (sector >= args.sections[0] && sector < sec_w) {
			
 
				+            theta_base = (float) pos[i2 + args.ne2];
			
 
				+        }
			
 
				+        else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
			
 
				+            theta_base = (float) pos[i2 + args.ne2 * 2];
			
 
				+        }
			
 
				+        else if (sector >= sec_w + args.sections[2]) {
			
 
				+            theta_base = (float) pos[i2 + args.ne2 * 3];
			
 
				+        }
			
 
				+
			
 
				+        int p = sector;
			
 
				+        if (sector >= sec_w + args.sections[2]) {
			
 
				+            p = sector - (sec_w + args.sections[2]);
			
 
				+        } else if (sector >= sec_w) {
			
 
				+            p = sector - sec_w;
			
 
				+        } else if (sector >= args.sections[0]) {
			
 
				+            p = sector - args.sections[0];
			
 
				+        }
			
 
				+
			
 
				+        const float theta = theta_base*pow(args.freq_base, inv_ndims*2.0f*p);
			
 
				+
			
 
				+        const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
			
 
				+
			
 
				+        rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
			
 
				+
			
 
				+        device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
			
 
				+        device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
			
 
				+
			
 
				+        const float x0 = src[0];
			
 
				+        const float x1 = src[args.n_dims];
			
 
				+
			
 
				+        dst_data[0]             = x0*cos_theta - x1*sin_theta;
			
 
				+        dst_data[args.n_dims]   = x0*sin_theta + x1*cos_theta;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				 typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
			
 
				 typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
			
 
				+typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
			
 
				+typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;
			
 
				 
			
 
				 template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
			
 
				 template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
			
@@ -4794,6 +4935,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
 
				 template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
			
 
				 template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
			
 
				 
			
 
				+template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
			
 
				+template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
			
 
				+
			
 
				+template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
			
 
				+template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
			
 
				+
			
 
				 typedef void (im2col_t)(
			
 
				         device const float * x,
			
 
				         device        char * dst,
			
--- a/llama/ggml-metal-impl.h
+++ b/llama/ggml-metal-impl.h
@@ -169,6 +169,7 @@ typedef struct {
 
				     float    attn_factor;
			
 
				     float    beta_fast;
			
 
				     float    beta_slow;
			
 
				+    int32_t  sections[4];
			
 
				 } ggml_metal_kargs_rope;
			
 
				 
			
 
				 typedef struct {
			
--- a/llama/ggml-metal.metal
+++ b/llama/ggml-metal.metal
@@ -2594,8 +2594,148 @@ kernel void kernel_rope_neox(
 
				     }
			
 
				 }
			
 
				 
			
 
				+
			
 
				+template<typename T>
			
 
				+kernel void kernel_rope_multi(
			
 
				+        constant ggml_metal_kargs_rope & args,
			
 
				+        device const char * src0,
			
 
				+        device const char * src1,
			
 
				+        device const char * src2,
			
 
				+        device       char * dst,
			
 
				+        ushort  tiitg[[thread_index_in_threadgroup]],
			
 
				+        ushort3 tptg [[threads_per_threadgroup]],
			
 
				+        uint3   tgpig[[threadgroup_position_in_grid]]) {
			
 
				+    const int i3 = tgpig[2];
			
 
				+    const int i2 = tgpig[1];
			
 
				+    const int i1 = tgpig[0];
			
 
				+
			
 
				+    float corr_dims[2];
			
 
				+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
			
 
				+
			
 
				+    device const int32_t * pos = (device const int32_t *) src1;
			
 
				+
			
 
				+    int sect_dims = args.sections[0] + args.sections[1] + args.sections[2] + args.sections[3];
			
 
				+    int sec_w = args.sections[1] + args.sections[0];
			
 
				+
			
 
				+    const float inv_ndims = -1.f/args.n_dims;
			
 
				+
			
 
				+    float cos_theta;
			
 
				+    float sin_theta;
			
 
				+
			
 
				+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
			
 
				+        if (i0 < args.n_dims) {
			
 
				+            const int ic = i0/2;
			
 
				+            const int sector = ic % sect_dims;
			
 
				+
			
 
				+            float theta_base = (float) pos[i2];
			
 
				+            if (sector >= args.sections[0] && sector < sec_w) {
			
 
				+                theta_base = (float) pos[i2 + args.ne2];
			
 
				+            }
			
 
				+            else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
			
 
				+                theta_base = (float) pos[i2 + args.ne2 * 2];
			
 
				+            }
			
 
				+            else if (sector >= sec_w + args.sections[2]) {
			
 
				+                theta_base = (float) pos[i2 + args.ne2 * 3];
			
 
				+            }
			
 
				+
			
 
				+            float theta = theta_base*pow(args.freq_base, inv_ndims*i0);
			
 
				+
			
 
				+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
			
 
				+
			
 
				+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
			
 
				+
			
 
				+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
			
 
				+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
			
 
				+
			
 
				+            const float x0 = src[0];
			
 
				+            const float x1 = src[args.n_dims/2];
			
 
				+
			
 
				+            dst_data[0]             = x0*cos_theta - x1*sin_theta;
			
 
				+            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
			
 
				+        } else {
			
 
				+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
			
 
				+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
			
 
				+
			
 
				+            dst_data[0] = src[0];
			
 
				+            dst_data[1] = src[1];
			
 
				+        }
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+template<typename T>
			
 
				+kernel void kernel_rope_vision(
			
 
				+        constant ggml_metal_kargs_rope & args,
			
 
				+        device const char * src0,
			
 
				+        device const char * src1,
			
 
				+        device const char * src2,
			
 
				+        device       char * dst,
			
 
				+        ushort  tiitg[[thread_index_in_threadgroup]],
			
 
				+        ushort3 tptg [[threads_per_threadgroup]],
			
 
				+        uint3   tgpig[[threadgroup_position_in_grid]]) {
			
 
				+    const int i3 = tgpig[2];
			
 
				+    const int i2 = tgpig[1];
			
 
				+    const int i1 = tgpig[0];
			
 
				+
			
 
				+    float corr_dims[2];
			
 
				+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
			
 
				+
			
 
				+    device const int32_t * pos = (device const int32_t *) src1;
			
 
				+
			
 
				+    int sect_dims = args.sections[0] + args.sections[1];
			
 
				+    int sec_w = args.sections[1] + args.sections[0];
			
 
				+    int sec_e = args.sections[2] + sec_w;
			
 
				+
			
 
				+    const float inv_ndims = -1.f/args.n_dims;
			
 
				+
			
 
				+    float cos_theta;
			
 
				+    float sin_theta;
			
 
				+
			
 
				+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
			
 
				+        const int ic = i0/2;
			
 
				+        const int sector = ic % sect_dims;
			
 
				+
			
 
				+        float theta_base = (float) pos[i2];
			
 
				+        if (sector >= args.sections[0] && sector < sec_w) {
			
 
				+            theta_base = (float) pos[i2 + args.ne2];
			
 
				+        }
			
 
				+        else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
			
 
				+            theta_base = (float) pos[i2 + args.ne2 * 2];
			
 
				+        }
			
 
				+        else if (sector >= sec_w + args.sections[2]) {
			
 
				+            theta_base = (float) pos[i2 + args.ne2 * 3];
			
 
				+        }
			
 
				+
			
 
				+        int p = sector;
			
 
				+        if (sector >= sec_w + args.sections[2]) {
			
 
				+            p = sector - (sec_w + args.sections[2]);
			
 
				+        } else if (sector >= sec_w) {
			
 
				+            p = sector - sec_w;
			
 
				+        } else if (sector >= args.sections[0]) {
			
 
				+            p = sector - args.sections[0];
			
 
				+        }
			
 
				+
			
 
				+        const float theta = theta_base*pow(args.freq_base, inv_ndims*2.0f*p);
			
 
				+
			
 
				+        const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
			
 
				+
			
 
				+        rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
			
 
				+
			
 
				+        device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
			
 
				+        device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
			
 
				+
			
 
				+        const float x0 = src[0];
			
 
				+        const float x1 = src[args.n_dims];
			
 
				+
			
 
				+        dst_data[0]             = x0*cos_theta - x1*sin_theta;
			
 
				+        dst_data[args.n_dims]   = x0*sin_theta + x1*cos_theta;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				 typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
			
 
				 typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
			
 
				+typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
			
 
				+typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;
			
 
				 
			
 
				 template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
			
 
				 template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
			
@@ -2603,6 +2743,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
 
				 template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
			
 
				 template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
			
 
				 
			
 
				+template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
			
 
				+template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
			
 
				+
			
 
				+template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
			
 
				+template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
			
 
				+
			
 
				 typedef void (im2col_t)(
			
 
				         device const float * x,
			
 
				         device        char * dst,
			
--- a/llama/ggml-metal_darwin_arm64.m
+++ b/llama/ggml-metal_darwin_arm64.m
@@ -328,6 +328,10 @@ enum ggml_metal_kernel_type {
 
				     GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
			
 
				     GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
			
 
				     GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
			
 
				+    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
			
 
				+    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,
			
 
				+    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,
			
 
				+    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,
			
 
				     GGML_METAL_KERNEL_TYPE_IM2COL_F16,
			
 
				     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
			
 
				     GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,
			
@@ -928,6 +932,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
 
				         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                 rope_norm_f16,                  true);
			
 
				         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,                 rope_neox_f32,                  true);
			
 
				         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,                 rope_neox_f16,                  true);
			
 
				+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,                rope_multi_f32,                 true);
			
 
				+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,                rope_multi_f16,                 true);
			
 
				+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,               rope_vision_f32,                true);
			
 
				+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,               rope_vision_f16,                true);
			
 
				         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                    im2col_f16,                     true);
			
 
				         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32,                    im2col_f32,                     true);
			
 
				         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,                im2col_ext_f16,                 true);
			
@@ -1155,16 +1163,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
 
				         case GGML_OP_NORM:
			
 
				             return true;
			
 
				         case GGML_OP_ROPE:
			
 
				-            {
			
 
				-                const int mode = ((const int32_t *) op->op_params)[2];
			
 
				-                if (mode & GGML_ROPE_TYPE_MROPE) {
			
 
				-                    return false;
			
 
				-                }
			
 
				-                if (mode & GGML_ROPE_TYPE_VISION) {
			
 
				-                    return false;
			
 
				-                }
			
 
				-                return true;
			
 
				-            }
			
 
				+            return true;
			
 
				         case GGML_OP_IM2COL:
			
 
				             return op->src[0]->type == GGML_TYPE_F16;
			
 
				         case GGML_OP_POOL_1D:
			
@@ -3083,6 +3082,7 @@ static void ggml_metal_encode_node(
 
				                 float attn_factor;
			
 
				                 float beta_fast;
			
 
				                 float beta_slow;
			
 
				+                int32_t sections[4];
			
 
				 
			
 
				                 memcpy(&freq_base,   (const int32_t *) dst->op_params +  5, sizeof(float));
			
 
				                 memcpy(&freq_scale,  (const int32_t *) dst->op_params +  6, sizeof(float));
			
@@ -3090,21 +3090,44 @@ static void ggml_metal_encode_node(
 
				                 memcpy(&attn_factor, (const int32_t *) dst->op_params +  8, sizeof(float));
			
 
				                 memcpy(&beta_fast,   (const int32_t *) dst->op_params +  9, sizeof(float));
			
 
				                 memcpy(&beta_slow,   (const int32_t *) dst->op_params + 10, sizeof(float));
			
 
				+                memcpy(&sections,    (const int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
			
 
				 
			
 
				                 const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
			
 
				+                const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
			
 
				+                const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
			
 
				+
			
 
				+                if (is_mrope) {
			
 
				+                    GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
			
 
				+                }
			
 
				+
			
 
				+                if (is_vision) {
			
 
				+                    GGML_ASSERT(n_dims == ne00/2);
			
 
				+                }
			
 
				 
			
 
				                 id<MTLComputePipelineState> pipeline = nil;
			
 
				 
			
 
				-                if (!is_neox) {
			
 
				+                if (is_neox) {
			
 
				                     switch (src0->type) {
			
 
				-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
			
 
				-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
			
 
				+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
			
 
				+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
			
 
				+                        default: GGML_ABORT("fatal error");
			
 
				+                    };
			
 
				+                } else if (is_mrope && !is_vision) {
			
 
				+                    switch (src0->type) {
			
 
				+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break;
			
 
				+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break;
			
 
				+                        default: GGML_ABORT("fatal error");
			
 
				+                    };
			
 
				+                } else if (is_vision) {
			
 
				+                    switch (src0->type) {
			
 
				+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break;
			
 
				+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break;
			
 
				                         default: GGML_ABORT("fatal error");
			
 
				                     };
			
 
				                 } else {
			
 
				                     switch (src0->type) {
			
 
				-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
			
 
				-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
			
 
				+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
			
 
				+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
			
 
				                         default: GGML_ABORT("fatal error");
			
 
				                     };
			
 
				                 }
			
@@ -3135,6 +3158,7 @@ static void ggml_metal_encode_node(
 
				                     /*.attn_factor =*/ attn_factor,
			
 
				                     /*.beta_fast   =*/ beta_fast,
			
 
				                     /*.beta_slow   =*/ beta_slow,
			
 
				+                    /*.sections    =*/ {sections[0], sections[1], sections[2], sections[3]}
			
 
				                 };
			
 
				 
			
 
				                 [encoder setComputePipelineState:pipeline];
			
--- a/llama/patches/0014-qwen2vl-support.patch
+++ b/llama/patches/0014-qwen2vl-support.patch
@@ -0,0 +1,299 @@
 
				+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
			
 
				+From: jmorganca <jmorganca@gmail.com>
			
 
				+Date: Sun, 15 Dec 2024 23:56:24 -0800
			
 
				+Subject: [PATCH] qwen2vl support
			
 
				+
			
 
				+---
			
 
				+ ggml/src/ggml-metal/ggml-metal-impl.h |   1 +
			
 
				+ ggml/src/ggml-metal/ggml-metal.m      |  54 +++++++---
			
 
				+ ggml/src/ggml-metal/ggml-metal.metal  | 146 ++++++++++++++++++++++++++
			
 
				+ 3 files changed, 186 insertions(+), 15 deletions(-)
			
 
				+
			
 
				+diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
			
 
				+index e3dc25f1..766a4999 100644
			
 
				+--- a/ggml/src/ggml-metal/ggml-metal-impl.h
			
 
				++++ b/ggml/src/ggml-metal/ggml-metal-impl.h
			
 
				+@@ -143,6 +143,7 @@ typedef struct {
			
 
				+     float    attn_factor;
			
 
				+     float    beta_fast;
			
 
				+     float    beta_slow;
			
 
				++    int32_t  sections[4];
			
 
				+ } ggml_metal_kargs_rope;
			
 
				+ 
			
 
				+ typedef struct {
			
 
				+diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
			
 
				+index 787fc713..806c9fd3 100644
			
 
				+--- a/ggml/src/ggml-metal/ggml-metal.m
			
 
				++++ b/ggml/src/ggml-metal/ggml-metal.m
			
 
				+@@ -302,6 +302,10 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
			
 
				+     GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
			
 
				+     GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
			
 
				+     GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
			
 
				++    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
			
 
				++    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,
			
 
				++    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,
			
 
				++    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,
			
 
				+     GGML_METAL_KERNEL_TYPE_IM2COL_F16,
			
 
				+     GGML_METAL_KERNEL_TYPE_IM2COL_F32,
			
 
				+     GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,
			
 
				+@@ -902,6 +906,10 @@ @implementation GGMLMetalClass
			
 
				+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                 rope_norm_f16,                  true);
			
 
				+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,                 rope_neox_f32,                  true);
			
 
				+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,                 rope_neox_f16,                  true);
			
 
				++        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,                rope_multi_f32,                 true);
			
 
				++        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,                rope_multi_f16,                 true);
			
 
				++        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,               rope_vision_f32,                true);
			
 
				++        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,               rope_vision_f16,                true);
			
 
				+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                    im2col_f16,                     true);
			
 
				+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32,                    im2col_f32,                     true);
			
 
				+         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_EXT_F16,                im2col_ext_f16,                 true);
			
 
				+@@ -1129,16 +1137,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
			
 
				+         case GGML_OP_NORM:
			
 
				+             return true;
			
 
				+         case GGML_OP_ROPE:
			
 
				+-            {
			
 
				+-                const int mode = ((const int32_t *) op->op_params)[2];
			
 
				+-                if (mode & GGML_ROPE_TYPE_MROPE) {
			
 
				+-                    return false;
			
 
				+-                }
			
 
				+-                if (mode & GGML_ROPE_TYPE_VISION) {
			
 
				+-                    return false;
			
 
				+-                }
			
 
				+-                return true;
			
 
				+-            }
			
 
				++            return true;
			
 
				+         case GGML_OP_IM2COL:
			
 
				+             return op->src[0]->type == GGML_TYPE_F16;
			
 
				+         case GGML_OP_POOL_1D:
			
 
				+@@ -3057,6 +3056,7 @@ static void ggml_metal_encode_node(
			
 
				+                 float attn_factor;
			
 
				+                 float beta_fast;
			
 
				+                 float beta_slow;
			
 
				++                int32_t sections[4];
			
 
				+ 
			
 
				+                 memcpy(&freq_base,   (const int32_t *) dst->op_params +  5, sizeof(float));
			
 
				+                 memcpy(&freq_scale,  (const int32_t *) dst->op_params +  6, sizeof(float));
			
 
				+@@ -3064,21 +3064,44 @@ static void ggml_metal_encode_node(
			
 
				+                 memcpy(&attn_factor, (const int32_t *) dst->op_params +  8, sizeof(float));
			
 
				+                 memcpy(&beta_fast,   (const int32_t *) dst->op_params +  9, sizeof(float));
			
 
				+                 memcpy(&beta_slow,   (const int32_t *) dst->op_params + 10, sizeof(float));
			
 
				++                memcpy(&sections,    (const int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
			
 
				+ 
			
 
				+                 const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
			
 
				++                const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
			
 
				++                const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
			
 
				++
			
 
				++                if (is_mrope) {
			
 
				++                    GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
			
 
				++                }
			
 
				++
			
 
				++                if (is_vision) {
			
 
				++                    GGML_ASSERT(n_dims == ne00/2);
			
 
				++                }
			
 
				+ 
			
 
				+                 id<MTLComputePipelineState> pipeline = nil;
			
 
				+ 
			
 
				+-                if (!is_neox) {
			
 
				++                if (is_neox) {
			
 
				+                     switch (src0->type) {
			
 
				+-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
			
 
				+-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
			
 
				++                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
			
 
				++                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
			
 
				++                        default: GGML_ABORT("fatal error");
			
 
				++                    };
			
 
				++                } else if (is_mrope && !is_vision) {
			
 
				++                    switch (src0->type) {
			
 
				++                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break;
			
 
				++                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break;
			
 
				++                        default: GGML_ABORT("fatal error");
			
 
				++                    };
			
 
				++                } else if (is_vision) {
			
 
				++                    switch (src0->type) {
			
 
				++                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break;
			
 
				++                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break;
			
 
				+                         default: GGML_ABORT("fatal error");
			
 
				+                     };
			
 
				+                 } else {
			
 
				+                     switch (src0->type) {
			
 
				+-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
			
 
				+-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
			
 
				++                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
			
 
				++                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
			
 
				+                         default: GGML_ABORT("fatal error");
			
 
				+                     };
			
 
				+                 }
			
 
				+@@ -3109,6 +3132,7 @@ static void ggml_metal_encode_node(
			
 
				+                     /*.attn_factor =*/ attn_factor,
			
 
				+                     /*.beta_fast   =*/ beta_fast,
			
 
				+                     /*.beta_slow   =*/ beta_slow,
			
 
				++                    /*.sections    =*/ {sections[0], sections[1], sections[2], sections[3]}
			
 
				+                 };
			
 
				+ 
			
 
				+                 [encoder setComputePipelineState:pipeline];
			
 
				+diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
			
 
				+index 204c93e6..67b3240f 100644
			
 
				+--- a/ggml/src/ggml-metal/ggml-metal.metal
			
 
				++++ b/ggml/src/ggml-metal/ggml-metal.metal
			
 
				+@@ -2568,8 +2568,148 @@ kernel void kernel_rope_neox(
			
 
				+     }
			
 
				+ }
			
 
				+ 
			
 
				++
			
 
				++template<typename T>
			
 
				++kernel void kernel_rope_multi(
			
 
				++        constant ggml_metal_kargs_rope & args,
			
 
				++        device const char * src0,
			
 
				++        device const char * src1,
			
 
				++        device const char * src2,
			
 
				++        device       char * dst,
			
 
				++        ushort  tiitg[[thread_index_in_threadgroup]],
			
 
				++        ushort3 tptg [[threads_per_threadgroup]],
			
 
				++        uint3   tgpig[[threadgroup_position_in_grid]]) {
			
 
				++    const int i3 = tgpig[2];
			
 
				++    const int i2 = tgpig[1];
			
 
				++    const int i1 = tgpig[0];
			
 
				++
			
 
				++    float corr_dims[2];
			
 
				++    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
			
 
				++
			
 
				++    device const int32_t * pos = (device const int32_t *) src1;
			
 
				++
			
 
				++    int sect_dims = args.sections[0] + args.sections[1] + args.sections[2] + args.sections[3];
			
 
				++    int sec_w = args.sections[1] + args.sections[0];
			
 
				++
			
 
				++    const float inv_ndims = -1.f/args.n_dims;
			
 
				++
			
 
				++    float cos_theta;
			
 
				++    float sin_theta;
			
 
				++
			
 
				++    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
			
 
				++        if (i0 < args.n_dims) {
			
 
				++            const int ic = i0/2;
			
 
				++            const int sector = ic % sect_dims;
			
 
				++
			
 
				++            float theta_base = (float) pos[i2];
			
 
				++            if (sector >= args.sections[0] && sector < sec_w) {
			
 
				++                theta_base = (float) pos[i2 + args.ne2];
			
 
				++            }
			
 
				++            else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
			
 
				++                theta_base = (float) pos[i2 + args.ne2 * 2];
			
 
				++            }
			
 
				++            else if (sector >= sec_w + args.sections[2]) {
			
 
				++                theta_base = (float) pos[i2 + args.ne2 * 3];
			
 
				++            }
			
 
				++
			
 
				++            float theta = theta_base*pow(args.freq_base, inv_ndims*i0);
			
 
				++
			
 
				++            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
			
 
				++
			
 
				++            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
			
 
				++
			
 
				++            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
			
 
				++            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
			
 
				++
			
 
				++            const float x0 = src[0];
			
 
				++            const float x1 = src[args.n_dims/2];
			
 
				++
			
 
				++            dst_data[0]             = x0*cos_theta - x1*sin_theta;
			
 
				++            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
			
 
				++        } else {
			
 
				++            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
			
 
				++            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
			
 
				++
			
 
				++            dst_data[0] = src[0];
			
 
				++            dst_data[1] = src[1];
			
 
				++        }
			
 
				++    }
			
 
				++}
			
 
				++
			
 
				++template<typename T>
			
 
				++kernel void kernel_rope_vision(
			
 
				++        constant ggml_metal_kargs_rope & args,
			
 
				++        device const char * src0,
			
 
				++        device const char * src1,
			
 
				++        device const char * src2,
			
 
				++        device       char * dst,
			
 
				++        ushort  tiitg[[thread_index_in_threadgroup]],
			
 
				++        ushort3 tptg [[threads_per_threadgroup]],
			
 
				++        uint3   tgpig[[threadgroup_position_in_grid]]) {
			
 
				++    const int i3 = tgpig[2];
			
 
				++    const int i2 = tgpig[1];
			
 
				++    const int i1 = tgpig[0];
			
 
				++
			
 
				++    float corr_dims[2];
			
 
				++    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
			
 
				++
			
 
				++    device const int32_t * pos = (device const int32_t *) src1;
			
 
				++
			
 
				++    int sect_dims = args.sections[0] + args.sections[1];
			
 
				++    int sec_w = args.sections[1] + args.sections[0];
			
 
				++    int sec_e = args.sections[2] + sec_w;
			
 
				++
			
 
				++    const float inv_ndims = -1.f/args.n_dims;
			
 
				++
			
 
				++    float cos_theta;
			
 
				++    float sin_theta;
			
 
				++
			
 
				++    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
			
 
				++        const int ic = i0/2;
			
 
				++        const int sector = ic % sect_dims;
			
 
				++
			
 
				++        float theta_base = (float) pos[i2];
			
 
				++        if (sector >= args.sections[0] && sector < sec_w) {
			
 
				++            theta_base = (float) pos[i2 + args.ne2];
			
 
				++        }
			
 
				++        else if (sector >= sec_w && sector < sec_w + args.sections[2]) {
			
 
				++            theta_base = (float) pos[i2 + args.ne2 * 2];
			
 
				++        }
			
 
				++        else if (sector >= sec_w + args.sections[2]) {
			
 
				++            theta_base = (float) pos[i2 + args.ne2 * 3];
			
 
				++        }
			
 
				++
			
 
				++        int p = sector;
			
 
				++        if (sector >= sec_w + args.sections[2]) {
			
 
				++            p = sector - (sec_w + args.sections[2]);
			
 
				++        } else if (sector >= sec_w) {
			
 
				++            p = sector - sec_w;
			
 
				++        } else if (sector >= args.sections[0]) {
			
 
				++            p = sector - args.sections[0];
			
 
				++        }
			
 
				++
			
 
				++        const float theta = theta_base*pow(args.freq_base, inv_ndims*2.0f*p);
			
 
				++
			
 
				++        const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
			
 
				++
			
 
				++        rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
			
 
				++
			
 
				++        device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
			
 
				++        device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
			
 
				++
			
 
				++        const float x0 = src[0];
			
 
				++        const float x1 = src[args.n_dims];
			
 
				++
			
 
				++        dst_data[0]             = x0*cos_theta - x1*sin_theta;
			
 
				++        dst_data[args.n_dims]   = x0*sin_theta + x1*cos_theta;
			
 
				++    }
			
 
				++}
			
 
				++
			
 
				++
			
 
				+ typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
			
 
				+ typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
			
 
				++typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
			
 
				++typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;
			
 
				+ 
			
 
				+ template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
			
 
				+ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
			
 
				+@@ -2577,6 +2717,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
			
 
				+ template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
			
 
				+ template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;
			
 
				+ 
			
 
				++template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
			
 
				++template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
			
 
				++
			
 
				++template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
			
 
				++template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
			
 
				++
			
 
				+ typedef void (im2col_t)(
			
 
				+         device const float * x,
			
 
				+         device        char * dst,