
chore: update gitattributes (#8860)

* chore: update gitattributes
* chore: add build info source
Michael Yang 2 months ago
parent
commit
5b446cc815

+ 4 - 0
.gitattributes

@@ -15,6 +15,10 @@ ml/backend/**/*.cu linguist-vendored
 ml/backend/**/*.cuh linguist-vendored
 ml/backend/**/*.m linguist-vendored
 ml/backend/**/*.metal linguist-vendored
+ml/backend/**/CMakeLists.txt linguist-vendored
+
+llama/build-info.cpp linguist-generated
+ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s linguist-generated
 
 * text=auto
 *.go text eol=lf
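
A quick way to confirm the new attributes resolve as intended is git check-attr; a minimal sketch, assuming it is run from the repository root after this commit:

    # should report "linguist-generated: set" for both generated files
    git check-attr linguist-generated -- \
        llama/build-info.cpp \
        ml/backend/ggml/ggml/src/ggml-metal/ggml-metal-embed.s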

+ 1 - 1
.github/workflows/test.yaml

@@ -163,5 +163,5 @@ jobs:
       - uses: actions/checkout@v4
       - name: Verify patches apply cleanly and do not change files
         run: |
-          make -f Makefile.sync clean checkout sync
+          make -f Makefile.sync clean sync
           git diff --compact-summary --exit-code

+ 5 - 1
Makefile.sync

@@ -15,7 +15,11 @@ help:
 	@echo "    make -f $(lastword $(MAKEFILE_LIST)) clean sync"
 
 .PHONY: sync
-sync: llama/llama.cpp ml/backend/ggml/ggml apply-patches
+sync: llama/build-info.cpp llama/llama.cpp ml/backend/ggml/ggml apply-patches
+
+.PHONY: llama/build-info.cpp
+llama/build-info.cpp: llama/build-info.cpp.in
+	sed -e 's|@FETCH_HEAD@|$(FETCH_HEAD)|' $< > $@
 
 .PHONY: llama/llama.cpp
 llama/llama.cpp: llama/vendor/ apply-patches
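
The new rule renders llama/build-info.cpp from the template added below by substituting in the vendored llama.cpp commit. A rough stand-alone equivalent of the sed invocation, assuming FETCH_HEAD holds the upstream commit hash that Makefile.sync tracks:

    # hypothetical shell equivalent of the llama/build-info.cpp rule
    FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c
    sed -e "s|@FETCH_HEAD@|$FETCH_HEAD|" llama/build-info.cpp.in > llama/build-info.cpp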

+ 1 - 1
llama/build-info.cpp

@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 0;
-char const *LLAMA_COMMIT = "ba1cb19cdd0d92e012e0f6e009e0620f854b6afd";
+char const *LLAMA_COMMIT = "46e3556e01b824e52395fb050b29804b6cff2a7c";
 char const *LLAMA_COMPILER = "";
 char const *LLAMA_BUILD_TARGET = "";

+ 4 - 0
llama/build-info.cpp.in

@@ -0,0 +1,4 @@
+int LLAMA_BUILD_NUMBER = 0;
+char const *LLAMA_COMMIT = "@FETCH_HEAD@";
+char const *LLAMA_COMPILER = "";
+char const *LLAMA_BUILD_TARGET = "";

+ 0 - 77
ml/backend/ggml/ggml/src/ggml-cuda/template-instances/generate_cu_files.py

@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-
-from glob import glob
-import os
-
-TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_F16"]
-
-SOURCE_FATTN_VEC = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../fattn-vec-f{vkq_size}.cuh"
-
-DECL_FATTN_VEC_F{vkq_size}_CASE({head_size}, {type_k}, {type_v});
-"""
-
-SOURCE_FATTN_WMMA_START = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../fattn-wmma-f16.cuh"
-
-"""
-
-SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}, {kq_acc_t});\n"
-
-TYPES_MMQ = [
-    "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
-    "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
-    "GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
-    "GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
-]
-
-SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
-
-#include "../mmq.cuh"
-
-DECL_MMQ_CASE({type});
-"""
-
-
-def get_short_name(long_quant_name):
-    return long_quant_name.replace("GGML_TYPE_", "").lower()
-
-
-def get_head_sizes(type_k, type_v):
-    if type_k == "GGML_TYPE_F16" and type_v == "GGML_TYPE_F16":
-        return [64, 128, 256]
-    if type_k == "GGML_TYPE_F16":
-        return [64, 128]
-    return [128]
-
-
-for filename in glob("*.cu"):
-    os.remove(filename)
-
-for vkq_size in [16, 32]:
-    for type_k in TYPES_KV:
-        for type_v in TYPES_KV:
-            for head_size in get_head_sizes(type_k, type_v):
-                with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
-                    f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))
-
-for kq_acc_t in ["half", "float"]:
-    for cols_per_block in [8, 16, 32]:
-        if kq_acc_t == "float" and cols_per_block == 8:
-            continue
-
-        with open(f"fattn-wmma-f16-instance-kq{kq_acc_t}-cpb{cols_per_block}.cu", "w") as f:
-            f.write(SOURCE_FATTN_WMMA_START)
-
-            for head_size in [64, 80, 96, 112, 128, 256]:
-                if cols_per_block == 8 and head_size % 32 != 0: # wmma fragment is 8x32
-                    continue
-                if kq_acc_t == "float" and cols_per_block == 32 and head_size == 256: # register spilling, bad performance
-                    continue
-                f.write(SOURCE_FATTN_WMMA_CASE.format(kq_acc_t=kq_acc_t, cols_per_block=cols_per_block, head_size=head_size))
-
-for type in TYPES_MMQ:
-    with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
-        f.write(SOURCE_MMQ.format(type=type))