0003-metal-add-missing-barriers-for-mul-mat-2699.patch 1.3 KB

1234567891011121314151617181920212223242526272829303132
  1. From 8c0ea847ac1460bca534d92266e3471cb31471be Mon Sep 17 00:00:00 2001
  2. From: Bruce MacDonald <brucewmacdonald@gmail.com>
  3. Date: Tue, 5 Sep 2023 16:05:08 -0400
  4. Subject: [PATCH] metal: add missing barriers for mul-mat #2699
  5. ---
  6. ggml-metal.metal | 2 ++
  7. 1 file changed, 2 insertions(+)
  8. diff --git a/ggml-metal.metal b/ggml-metal.metal
  9. index 3f31252..ce3541f 100644
  10. --- a/ggml-metal.metal
  11. +++ b/ggml-metal.metal
  12. @@ -1850,6 +1850,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
  13. //load data and store to threadgroup memory
  14. half4x4 temp_a;
  15. dequantize_func(x, il, temp_a);
  16. + threadgroup_barrier(mem_flags::mem_threadgroup);
  17. #pragma unroll(16)
  18. for (int i = 0; i < 16; i++) {
  19. *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
  20. @@ -1895,6 +1896,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
  21. }
  22. } else {
  23. // block is smaller than 64x32, we should avoid writing data outside of the matrix
  24. + threadgroup_barrier(mem_flags::mem_threadgroup);
  25. threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
  26. + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
  27. for (int i = 0; i < 8; i++) {
  28. --
  29. 2.39.2 (Apple Git-143)