瀏覽代碼

use matrix multiplication kernels in more cases

jmorganca 1 年之前
父節點
當前提交
27d88dbfdc
共有 1 個文件被更改,包括 45 次插入和 0 次刪除
  1. 45 0
      llm/patches/04-metal.diff

+ 45 - 0
llm/patches/04-metal.diff

@@ -0,0 +1,45 @@
+diff --git a/ggml-metal.m b/ggml-metal.m
+index 0207b787..b5e9884b 100644
+--- a/ggml-metal.m
++++ b/ggml-metal.m
+@@ -1396,27 +1396,23 @@ static enum ggml_status ggml_metal_graph_compute(
+                         // to the matrix-vector kernel
+                         int ne11_mm_min = 1;
+ 
+-#if 0
+                         // the numbers below are measured on M2 Ultra for 7B and 13B models
+                         // these numbers do not translate to other devices or model sizes
+                         // TODO: need to find a better approach
+-                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
+-                            switch (src0t) {
+-                                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
+-                                case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
+-                                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+-                                case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
+-                                case GGML_TYPE_Q4_0:
+-                                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+-                                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+-                                case GGML_TYPE_Q5_0:                          // not tested yet
+-                                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+-                                case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
+-                                case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
+-                                default:             ne11_mm_min = 1;  break;
+-                            }
++                        switch (src0t) {
++                            case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
++                            case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
++                            case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
++                            case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
++                            case GGML_TYPE_Q4_0:
++                            case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
++                            case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
++                            case GGML_TYPE_Q5_0:                          // not tested yet
++                            case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
++                            case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
++                            case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
++                            default:             ne11_mm_min = 1;  break;
+                         }
+-#endif
+ 
+                         // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                         // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel