5 months ago · 8cb7b94c40
--- a/.gitattributes
+++ b/.gitattributes
@@ -7,5 +7,14 @@ llama/**/*.cuh linguist-vendored
 
				 llama/**/*.m linguist-vendored
			
 
				 llama/**/*.metal linguist-vendored
			
 
				 
			
 
				+ml/backend/**/*.c linguist-vendored
			
 
				+ml/backend/**/*.h linguist-vendored
			
 
				+ml/backend/**/*.cpp linguist-vendored
			
 
				+ml/backend/**/*.hpp linguist-vendored
			
 
				+ml/backend/**/*.cu linguist-vendored
			
 
				+ml/backend/**/*.cuh linguist-vendored
			
 
				+ml/backend/**/*.m linguist-vendored
			
 
				+ml/backend/**/*.metal linguist-vendored
			
 
				+
			
 
				 * text=auto
			
 
				 *.go text eol=lf
			
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,45 @@
 
				+cmake_minimum_required(VERSION 3.21)
			
 
				+
			
 
				+project(Ollama C CXX)
			
 
				+
			
 
				+include(CheckLanguage)
			
 
				+
			
 
				+find_package(Threads REQUIRED)
			
 
				+
			
 
				+# Default to an optimized build, but honor a build type supplied via -DCMAKE_BUILD_TYPE or a preset.
			
 
				+if(NOT CMAKE_BUILD_TYPE)
			
 
				+    set(CMAKE_BUILD_TYPE Release)
			
 
				+endif()
			
 
				+# Libraries declared without an explicit type are built as shared libraries.
			
 
				+set(BUILD_SHARED_LIBS ON)
			
 
				+
			
 
				+set(CMAKE_CXX_STANDARD 17)
			
 
				+set(CMAKE_CXX_STANDARD_REQUIRED ON)
			
 
				+set(CMAKE_CXX_EXTENSIONS OFF)
			
 
				+
			
 
				+set(GGML_BUILD ON)
			
 
				+set(GGML_SHARED ON)
			
 
				+set(GGML_CCACHE ON)
			
 
				+set(GGML_BACKEND_DL ON)
			
 
				+set(GGML_BACKEND_SHARED ON)
			
 
				+set(GGML_SCHED_MAX_COPIES 4)
			
 
				+set(GGML_CPU_ALL_VARIANTS ON)
			
 
				+set(GGML_CUDA_PEER_MAX_BATCH_SIZE 128)
			
 
				+set(GGML_LLAMAFILE ON)
			
 
				+
			
 
				+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
			
 
				+
			
 
				+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
			
 
				+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include)
			
 
				+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
			
 
				+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)
			
 
				+
			
 
				+set(GGML_CPU ON)
			
 
				+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
			
 
				+set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
			
 
				+
			
 
				+check_language(CUDA)
			
 
				+if(CMAKE_CUDA_COMPILER)
			
 
				+    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cuda)
			
 
				+endif()
			
 
				+
			
 
				+check_language(HIP)
			
 
				+if(CMAKE_HIP_COMPILER)
			
 
				+    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-hip)
			
 
				+endif()
			
--- a/Dockerfile2
+++ b/Dockerfile2
@@ -0,0 +1,59 @@
 
				+ARG CUDA_11_VERSION=11.3
			
 
				+ARG CUDA_12_VERSION=12.4
			
 
				+ARG ROCM_VERSION=6.1.2
			
 
				+ARG JETPACK_5_VERSION=r35.4.1
			
 
				+ARG JETPACK_6_VERSION=r36.2.0
			
 
				+ARG CMAKE_VERSION=3.31.2
			
 
				+
			
 
				+FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS base
			
 
				+ARG CMAKE_VERSION
			
 
				+RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz | tar xz -C /usr --strip-components 1
			
 
				+RUN sed -i -e 's/mirror.centos.org/vault.centos.org/g' -e 's/^#.*baseurl=http/baseurl=http/g' -e 's/^mirrorlist=http/#mirrorlist=http/g' /etc/yum.repos.d/*.repo \
			
 
				+    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
			
 
				+
			
 
				+# FROM --platform=linux/arm64 rockylinux:8 AS base
			
 
				+# ARG CMAKE_VERSION
			
 
				+# RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
			
 
				+# RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo
			
 
				+
			
 
				+FROM base AS amd64
			
 
				+ARG CUDA_11_VERSION
			
 
				+ARG CUDA_12_VERSION
			
 
				+RUN yum install -y cuda-toolkit-${CUDA_11_VERSION//./-} \
			
 
				+    && yum install -y cuda-toolkit-${CUDA_12_VERSION//./-}
			
 
				+COPY CMakeLists.txt CMakeLists.txt
			
 
				+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
			
 
				+
			
 
				+FROM --platform=linux/amd64 amd64 AS cuda_11
			
 
				+ENV PATH=/usr/local/cuda-${CUDA_11_VERSION}/bin:$PATH
			
 
				+RUN cmake --build --parallel --preset 'CUDA 11'
			
 
				+
			
 
				+# CUDA 12 build stage: put the CUDA 12 toolkit first on PATH and build with the matching preset.
			
 
				+# NOTE(review): assumes a 'CUDA 12' preset is defined next to 'CUDA 11' in the presets file - confirm.
			
 
				+FROM --platform=linux/amd64 amd64 AS cuda_12
			
 
				+ENV PATH=/usr/local/cuda-${CUDA_12_VERSION}/bin:$PATH
			
 
				+RUN cmake --build --parallel --preset 'CUDA 12'
			
 
				+
			
 
				+FROM --platform=linux/amd64 amd64 AS rocm
			
 
				+RUN cmake --build --parallel --preset 'ROCm 6'
			
 
				+
			
 
				+# JetPack 5 build stage: install CMake into /usr, copy the build inputs, then build the 'JetPack 5' preset.
			
 
				+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5_VERSION} AS jetpack_5
			
 
				+ARG CMAKE_VERSION
			
 
				+RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
			
 
				+COPY CMakeLists.txt .
			
 
				+# Keep the source tree layout: CMakeLists.txt refers to ml/backend/ggml/ggml/src relative to itself.
			
 
				+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
			
 
				+RUN cmake --build --parallel --preset 'JetPack 5'
			
 
				+
			
 
				+# JetPack 6 build stage: install CMake into /usr, copy the build inputs, then build the 'JetPack 6' preset.
			
 
				+FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6_VERSION} AS jetpack_6
			
 
				+ARG CMAKE_VERSION
			
 
				+RUN curl -fsSL https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz | tar xz -C /usr --strip-components 1
			
 
				+COPY CMakeLists.txt .
			
 
				+# Keep the source tree layout: CMakeLists.txt refers to ml/backend/ggml/ggml/src relative to itself.
			
 
				+COPY ml/backend/ggml/ggml ml/backend/ggml/ggml
			
 
				+RUN cmake --build --parallel --preset 'JetPack 6'
			
 
				+
			
 
				+FROM --platform=linux/amd64 golang:1.23
			
 
				+COPY --from=cuda_11 build/libggml-cuda.so libggml-cuda-11.so
			
 
				+COPY --from=cuda_12 build/libggml-cuda.so libggml-cuda-12.so
			
 
				+COPY --from=rocm build/libggml-hip.so libggml-hip.so
			
 
				+
			
 
				+# FROM --platform=linux/arm64 golang:1.23
			
 
				+# COPY --from=jetpack_5 build/libggml-cuda.so libggml-cuda-jetpack-5.so
			
 
				+# COPY --from=jetpack_6 build/libggml-cuda.so libggml-cuda-jetpack-6.so
			
--- a/Makefile
+++ b/Makefile
@@ -1,103 +0,0 @@
 
				-# top level makefile for Ollama
			
 
				-include make/common-defs.make
			
 
				-
			
 
				-
			
 
				-# Determine which if any GPU runners we should build
			
 
				-include make/cuda-v11-defs.make
			
 
				-include make/cuda-v12-defs.make
			
 
				-include make/rocm-defs.make
			
 
				-
			
 
				-ifeq ($(CUSTOM_CPU_FLAGS),)
			
 
				-ifeq ($(ARCH),amd64)
			
 
				-	RUNNER_TARGETS=cpu
			
 
				-endif
			
 
				-# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
			
 
				-ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
			
 
				-ifneq ($(CUDA_11_COMPILER),)
			
 
				-	RUNNER_TARGETS += cuda_v11
			
 
				-endif
			
 
				-ifneq ($(CUDA_12_COMPILER),)
			
 
				-	RUNNER_TARGETS += cuda_v12
			
 
				-endif
			
 
				-endif
			
 
				-else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected
			
 
				-ifneq ($(CUDA_12_COMPILER),)
			
 
				-	RUNNER_TARGETS += cuda_v12
			
 
				-else ifneq ($(CUDA_11_COMPILER),)
			
 
				-	RUNNER_TARGETS += cuda_v11
			
 
				-endif
			
 
				-endif
			
 
				-
			
 
				-ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
			
 
				-ifneq ($(HIP_COMPILER),)
			
 
				-	RUNNER_TARGETS += rocm
			
 
				-endif
			
 
				-endif
			
 
				-
			
 
				-
			
 
				-all: runners exe
			
 
				-
			
 
				-dist: $(addprefix dist_, $(RUNNER_TARGETS)) dist_exe
			
 
				-
			
 
				-dist_%:
			
 
				-	@$(MAKE) --no-print-directory -f make/Makefile.$* dist
			
 
				-
			
 
				-runners: $(RUNNER_TARGETS)
			
 
				-
			
 
				-$(RUNNER_TARGETS):
			
 
				-	@$(MAKE) --no-print-directory -f make/Makefile.$@
			
 
				-
			
 
				-exe dist_exe:
			
 
				-	@$(MAKE) --no-print-directory -f make/Makefile.ollama $@
			
 
				-
			
 
				-help-sync apply-patches create-patches sync sync-clean:
			
 
				-	@$(MAKE) --no-print-directory -f make/Makefile.sync $@
			
 
				-
			
 
				-test integration lint:
			
 
				-	@$(MAKE) --no-print-directory -f make/Makefile.test $@
			
 
				-
			
 
				-clean:
			
 
				-	rm -rf $(BUILD_DIR) $(DIST_LIB_DIR) $(OLLAMA_EXE) $(DIST_OLLAMA_EXE)
			
 
				-	go clean -cache
			
 
				-
			
 
				-help:
			
 
				-	@echo "The following make targets will help you build Ollama"
			
 
				-	@echo ""
			
 
				-	@echo "	make all   		# (default target) Build Ollama llm subprocess runners, and the primary ollama executable"
			
 
				-	@echo "	make runners		# Build Ollama llm subprocess runners; after you may use 'go build .' to build the primary ollama exectuable"
			
 
				-	@echo "	make <runner>		# Build specific runners. Enabled: '$(RUNNER_TARGETS)'"
			
 
				-	@echo "	make dist		# Build the runners and primary ollama executable for distribution"
			
 
				-	@echo "	make help-sync 		# Help information on vendor update targets"
			
 
				-	@echo "	make help-runners 	# Help information on runner targets"
			
 
				-	@echo ""
			
 
				-	@echo "The following make targets will help you test Ollama"
			
 
				-	@echo ""
			
 
				-	@echo "	make test   		# Run unit tests"
			
 
				-	@echo "	make integration	# Run integration tests.  You must 'make all' first"
			
 
				-	@echo "	make lint   		# Run lint and style tests"
			
 
				-	@echo ""
			
 
				-	@echo "For more information see 'docs/development.md'"
			
 
				-	@echo ""
			
 
				-
			
 
				-
			
 
				-help-runners:
			
 
				-	@echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'"
			
 
				-	@echo ""
			
 
				-	@echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)'  (Override with CUSTOM_CPU_FLAGS)"
			
 
				-	@echo ""
			
 
				-	@echo "# CUDA_PATH sets the location where CUDA toolkits are present"
			
 
				-	@echo "CUDA_PATH=$(CUDA_PATH)"
			
 
				-	@echo "	CUDA_11_PATH=$(CUDA_11_PATH)"
			
 
				-	@echo "	CUDA_11_COMPILER=$(CUDA_11_COMPILER)"
			
 
				-	@echo "	CUDA_12_PATH=$(CUDA_12_PATH)"
			
 
				-	@echo "	CUDA_12_COMPILER=$(CUDA_12_COMPILER)"
			
 
				-	@echo ""
			
 
				-	@echo "# HIP_PATH sets the location where the ROCm toolkit is present"
			
 
				-	@echo "HIP_PATH=$(HIP_PATH)"
			
 
				-	@echo "	HIP_COMPILER=$(HIP_COMPILER)"
			
 
				-
			
 
				-.PHONY: all exe dist help help-sync help-runners test integration lint runners clean $(RUNNER_TARGETS)
			
 
				-
			
 
				-# Handy debugging for make variables
			
 
				-print-%:
			
 
				-	@echo '$*=$($*)'
			
--- a/Makefile2
+++ b/Makefile2
@@ -0,0 +1,46 @@
 
				+UPSTREAM=https://github.com/ggerganov/llama.cpp.git
			
 
				+WORKDIR=llama/vendor
			
 
				+FETCH_HEAD=46e3556e01b824e52395fb050b29804b6cff2a7c
			
 
				+
			
 
				+all: sync
			
 
				+
			
 
				+.PHONY: sync
			
 
				+sync: llama/llama.cpp ml/backend/ggml/ggml
			
 
				+
			
 
				+.PHONY: llama/llama.cpp
			
 
				+llama/llama.cpp: llama/vendor/ apply_patches
			
 
				+	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
			
 
				+
			
 
				+.PHONY: ml/backend/ggml/ggml apply_patches
			
 
				+ml/backend/ggml/ggml: llama/vendor/ggml/ apply_patches
			
 
				+	rsync -arvzc -f "merge $@/.rsync-filter" $< $@
			
 
				+
			
 
				+PATCHES=$(wildcard llama/patches/*.patch)
			
 
				+
			
 
				+.PHONY: apply_patches
			
 
				+.NOTPARALLEL:
			
 
				+apply_patches: $(addsuffix ed, $(PATCHES))
			
 
				+
			
 
				+%.patched: %.patch
			
 
				+	@if git -c user.name=nobody -c 'user.email=<>' -C $(WORKDIR) am -3 $(realpath $<); then touch $@; else git -C $(WORKDIR) am --abort; exit 1; fi
			
 
				+
			
 
				+.PHONY: checkout
			
 
				+checkout: $(WORKDIR)
			
 
				+	git -C $(WORKDIR) fetch
			
 
				+	git -C $(WORKDIR) checkout -f $(FETCH_HEAD)
			
 
				+
			
 
				+$(WORKDIR):
			
 
				+	git clone $(UPSTREAM) $(WORKDIR)
			
 
				+
			
 
				+# Export the commits made in $(WORKDIR) since $(FETCH_HEAD) as patch files into llama/patches.
			
 
				+.PHONY: format_patches
			
 
				+format_patches: llama/patches
			
 
				+	git -C $(WORKDIR) format-patch \
			
 
				+		--no-signature \
			
 
				+		--no-numbered \
			
 
				+		--zero-commit \
			
 
				+		-o $(realpath $<) \
			
 
				+		$(FETCH_HEAD)
			
 
				+
			
 
				+# Run the checkout target, then delete the .patched stamp files so the patches are applied again.
			
 
				+.PHONY: clean
			
 
				+clean: checkout
			
 
				+	$(RM) $(addsuffix ed, $(PATCHES))
			
--- a/cache/cache.go
+++ b/cache/cache.go
@@ -0,0 +1,63 @@
 
				+package cache
			
 
				+
			
 
				+import (
			
 
				+	"github.com/ollama/ollama/ml"
			
 
				+)
			
 
				+
			
 
				+type Options struct {
			
 
				+	Position int
			
 
				+}
			
 
				+
			
 
				+type Cache interface {
			
 
				+	Sub(i int) Cache
			
 
				+	Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor)
			
 
				+}
			
 
				+
			
 
				+type Simple struct {
			
 
				+	DType    ml.DType
			
 
				+	Capacity int
			
 
				+
			
 
				+	keys, values []ml.Tensor
			
 
				+}
			
 
				+
			
 
				+func (c *Simple) Sub(i int) Cache {
			
 
				+	if i >= len(c.keys) {
			
 
				+		c.keys = append(c.keys, make([]ml.Tensor, i-len(c.keys)+1)...)
			
 
				+		c.values = append(c.values, make([]ml.Tensor, i-len(c.values)+1)...)
			
 
				+	}
			
 
				+
			
 
				+	return &Simple{
			
 
				+		keys:     c.keys[i : i+1],
			
 
				+		values:   c.values[i : i+1],
			
 
				+		Capacity: c.Capacity,
			
 
				+		DType:    c.DType,
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func (c *Simple) Put(ctx ml.Context, key, value ml.Tensor, opts Options) (ml.Tensor, ml.Tensor) {
			
 
				+	if c.keys[0] == nil || c.values[0] == nil {
			
 
				+		c.keys[0] = ctx.Zeros(c.DType, int(key.Dim(0)*key.Dim(1))*c.Capacity)
			
 
				+		c.values[0] = ctx.Zeros(c.DType, int(value.Dim(0)*value.Dim(1))*c.Capacity)
			
 
				+	}
			
 
				+
			
 
				+	ctx.Forward(key.Copy(ctx, c.keys[0].View(ctx, int(key.Stride(2))*opts.Position, int(key.Dim(0)*key.Dim(1)*key.Dim(2)))))
			
 
				+	ctx.Forward(value.Copy(ctx, c.values[0].View(ctx, int(value.Stride(2))*opts.Position, int(value.Dim(0)*value.Dim(1)*value.Dim(2)))))
			
 
				+
			
 
				+	n := min(c.Capacity, int(key.Dim(2))+opts.Position)
			
 
				+
			
 
				+	key = c.keys[0].View(ctx, 0,
			
 
				+		int(key.Dim(0)), int(key.Stride(1)),
			
 
				+		int(key.Dim(1)), int(key.Stride(2)),
			
 
				+		n,
			
 
				+	)
			
 
				+
			
 
				+	value = c.values[0].View(ctx, 0,
			
 
				+		int(value.Dim(0)), int(value.Stride(1)),
			
 
				+		int(value.Dim(1)), int(value.Stride(2)),
			
 
				+		n,
			
 
				+	)
			
 
				+
			
 
				+	// TODO shift context if necessary
			
 
				+
			
 
				+	return key, value
			
 
				+}
			
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -9,7 +9,7 @@ import (
 
				 	"log/slog"
			
 
				 	"strings"
			
 
				 
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				+	"github.com/ollama/ollama/fs/ggml"
			
 
				 )
			
 
				 
			
 
				 type ModelParameters struct {
			
@@ -27,8 +27,8 @@ type AdapterParameters struct {
 
				 	} `json:"lora_parameters"`
			
 
				 }
			
 
				 
			
 
				-func (ModelParameters) KV(t *Tokenizer) llm.KV {
			
 
				-	kv := llm.KV{
			
 
				+func (ModelParameters) KV(t *Tokenizer) ggml.KV {
			
 
				+	kv := ggml.KV{
			
 
				 		"general.file_type":            uint32(1),
			
 
				 		"general.quantization_version": uint32(2),
			
 
				 		"tokenizer.ggml.pre":           t.Pre,
			
@@ -54,7 +54,7 @@ func (ModelParameters) KV(t *Tokenizer) llm.KV {
 
				 	return kv
			
 
				 }
			
 
				 
			
 
				-func (p AdapterParameters) KV() llm.KV {
			
 
				+func (p AdapterParameters) KV() ggml.KV {
			
 
				 	var alpha float32
			
 
				 	if p.LoraParameters.Alpha == 0 {
			
 
				 		alpha = float32(p.Alpha)
			
@@ -62,7 +62,7 @@ func (p AdapterParameters) KV() llm.KV {
 
				 		alpha = p.LoraParameters.Alpha
			
 
				 	}
			
 
				 
			
 
				-	kv := llm.KV{
			
 
				+	kv := ggml.KV{
			
 
				 		"adapter.lora.alpha": alpha,
			
 
				 		"adapter.type":       "lora",
			
 
				 		"general.file_type":  uint32(1),
			
@@ -79,19 +79,19 @@ func (ModelParameters) specialTokenTypes() []string {
 
				 	}
			
 
				 }
			
 
				 
			
 
				-func (ModelParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
			
 
				-	return llm.WriteGGUF(ws, kv, ts)
			
 
				+func (ModelParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
			
 
				+	return ggml.WriteGGUF(ws, kv, ts)
			
 
				 }
			
 
				 
			
 
				-func (AdapterParameters) writeFile(ws io.WriteSeeker, kv llm.KV, ts []llm.Tensor) error {
			
 
				-	return llm.WriteGGUF(ws, kv, ts)
			
 
				+func (AdapterParameters) writeFile(ws io.WriteSeeker, kv ggml.KV, ts []ggml.Tensor) error {
			
 
				+	return ggml.WriteGGUF(ws, kv, ts)
			
 
				 }
			
 
				 
			
 
				 type ModelConverter interface {
			
 
				 	// KV maps parameters to LLM key-values
			
 
				-	KV(*Tokenizer) llm.KV
			
 
				+	KV(*Tokenizer) ggml.KV
			
 
				 	// Tensors maps input tensors to LLM tensors. Model specific modifications can be done here.
			
 
				-	Tensors([]Tensor) []llm.Tensor
			
 
				+	Tensors([]Tensor) []ggml.Tensor
			
 
				 	// Replacements returns a list of string pairs to replace in tensor names.
			
 
				 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
			
 
				 	Replacements() []string
			
@@ -99,7 +99,7 @@ type ModelConverter interface {
 
				 	// specialTokenTypes returns any special token types the model uses
			
 
				 	specialTokenTypes() []string
			
 
				 	// writeFile writes the model to the provided io.WriteSeeker
			
 
				-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
			
 
				+	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
			
 
				 }
			
 
				 
			
 
				 type moreParser interface {
			
@@ -108,17 +108,17 @@ type moreParser interface {
 
				 
			
 
				 type AdapterConverter interface {
			
 
				 	// KV maps parameters to LLM key-values
			
 
				-	KV(llm.KV) llm.KV
			
 
				+	KV(ggml.KV) ggml.KV
			
 
				 	// Tensors maps input tensors to LLM tensors. Adapter specific modifications can be done here.
			
 
				-	Tensors([]Tensor) []llm.Tensor
			
 
				+	Tensors([]Tensor) []ggml.Tensor
			
 
				 	// Replacements returns a list of string pairs to replace in tensor names.
			
 
				 	// See [strings.Replacer](https://pkg.go.dev/strings#Replacer) for details
			
 
				 	Replacements() []string
			
 
				 
			
 
				-	writeFile(io.WriteSeeker, llm.KV, []llm.Tensor) error
			
 
				+	writeFile(io.WriteSeeker, ggml.KV, []ggml.Tensor) error
			
 
				 }
			
 
				 
			
 
				-func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV llm.KV) error {
			
 
				+func ConvertAdapter(fsys fs.FS, ws io.WriteSeeker, baseKV ggml.KV) error {
			
 
				 	bts, err := fs.ReadFile(fsys, "adapter_config.json")
			
 
				 	if err != nil {
			
 
				 		return err
			
--- a/convert/convert_bert.go
+++ b/convert/convert_bert.go
@@ -8,7 +8,7 @@ import (
 
				 	"slices"
			
 
				 	"strings"
			
 
				 
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				+	"github.com/ollama/ollama/fs/ggml"
			
 
				 )
			
 
				 
			
 
				 type bertModel struct {
			
@@ -85,7 +85,7 @@ func (p *bertModel) parseMore(fsys fs.FS) error {
 
				 	return nil
			
 
				 }
			
 
				 
			
 
				-func (p *bertModel) KV(t *Tokenizer) llm.KV {
			
 
				+func (p *bertModel) KV(t *Tokenizer) ggml.KV {
			
 
				 	kv := p.ModelParameters.KV(t)
			
 
				 	kv["general.architecture"] = "bert"
			
 
				 	kv["bert.attention.causal"] = false
			
@@ -132,8 +132,8 @@ func (p *bertModel) KV(t *Tokenizer) llm.KV {
 
				 	return kv
			
 
				 }
			
 
				 
			
 
				-func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
			
 
				-	var out []llm.Tensor
			
 
				+func (p *bertModel) Tensors(ts []Tensor) []ggml.Tensor {
			
 
				+	var out []ggml.Tensor
			
 
				 	for _, t := range ts {
			
 
				 		if slices.Contains([]string{
			
 
				 			"embeddings.position_ids",
			
@@ -143,7 +143,7 @@ func (p *bertModel) Tensors(ts []Tensor) []llm.Tensor {
 
				 			continue
			
 
				 		}
			
 
				 
			
 
				-		out = append(out, llm.Tensor{
			
 
				+		out = append(out, ggml.Tensor{
			
 
				 			Name:     t.Name(),
			
 
				 			Kind:     t.Kind(),
			
 
				 			Shape:    t.Shape(),
			
--- a/convert/convert_commandr.go
+++ b/convert/convert_commandr.go
@@ -3,7 +3,7 @@ package convert
 
				 import (
			
 
				 	"cmp"
			
 
				 
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				+	"github.com/ollama/ollama/fs/ggml"
			
 
				 )
			
 
				 
			
 
				 type commandrModel struct {
			
@@ -24,7 +24,7 @@ type commandrModel struct {
 
				 
			
 
				 var _ ModelConverter = (*commandrModel)(nil)
			
 
				 
			
 
				-func (p *commandrModel) KV(t *Tokenizer) llm.KV {
			
 
				+func (p *commandrModel) KV(t *Tokenizer) ggml.KV {
			
 
				 	kv := p.ModelParameters.KV(t)
			
 
				 	kv["general.architecture"] = "command-r"
			
 
				 	kv["general.name"] = "command-r"
			
@@ -43,10 +43,10 @@ func (p *commandrModel) KV(t *Tokenizer) llm.KV {
 
				 	return kv
			
 
				 }
			
 
				 
			
 
				-func (p *commandrModel) Tensors(ts []Tensor) []llm.Tensor {
			
 
				-	var out []llm.Tensor
			
 
				+func (p *commandrModel) Tensors(ts []Tensor) []ggml.Tensor {
			
 
				+	var out []ggml.Tensor
			
 
				 	for _, t := range ts {
			
 
				-		out = append(out, llm.Tensor{
			
 
				+		out = append(out, ggml.Tensor{
			
 
				 			Name:     t.Name(),
			
 
				 			Kind:     t.Kind(),
			
 
				 			Shape:    t.Shape(),
			
--- a/convert/convert_gemma.go
+++ b/convert/convert_gemma.go
@@ -6,7 +6,7 @@ import (
 
				 	"github.com/pdevine/tensor"
			
 
				 	"github.com/pdevine/tensor/native"
			
 
				 
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				+	"github.com/ollama/ollama/fs/ggml"
			
 
				 )
			
 
				 
			
 
				 type gemmaModel struct {
			
@@ -23,7 +23,7 @@ type gemmaModel struct {
 
				 
			
 
				 var _ ModelConverter = (*gemmaModel)(nil)
			
 
				 
			
 
				-func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
			
 
				+func (p *gemmaModel) KV(t *Tokenizer) ggml.KV {
			
 
				 	kv := p.ModelParameters.KV(t)
			
 
				 	kv["general.architecture"] = "gemma"
			
 
				 	kv["gemma.context_length"] = p.MaxPositionEmbeddings
			
@@ -42,14 +42,14 @@ func (p *gemmaModel) KV(t *Tokenizer) llm.KV {
 
				 	return kv
			
 
				 }
			
 
				 
			
 
				-func (p *gemmaModel) Tensors(ts []Tensor) []llm.Tensor {
			
 
				-	var out []llm.Tensor
			
 
				+func (p *gemmaModel) Tensors(ts []Tensor) []ggml.Tensor {
			
 
				+	var out []ggml.Tensor
			
 
				 	for _, t := range ts {
			
 
				 		if strings.HasSuffix(t.Name(), "_norm.weight") {
			
 
				 			t.SetRepacker(p.addOne)
			
 
				 		}
			
 
				 
			
 
				-		out = append(out, llm.Tensor{
			
 
				+		out = append(out, ggml.Tensor{
			
 
				 			Name:     t.Name(),
			
 
				 			Kind:     t.Kind(),
			
 
				 			Shape:    t.Shape(),
			
--- a/convert/convert_gemma2.go
+++ b/convert/convert_gemma2.go
@@ -1,8 +1,6 @@
 
				 package convert
			
 
				 
			
 
				-import (
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				-)
			
 
				+import "github.com/ollama/ollama/fs/ggml"
			
 
				 
			
 
				 type gemma2Model struct {
			
 
				 	gemmaModel
			
@@ -11,7 +9,7 @@ type gemma2Model struct {
 
				 	FinalLogitSoftcap     float32 `json:"final_logit_softcapping"`
			
 
				 }
			
 
				 
			
 
				-func (p *gemma2Model) KV(t *Tokenizer) llm.KV {
			
 
				+func (p *gemma2Model) KV(t *Tokenizer) ggml.KV {
			
 
				 	kv := p.ModelParameters.KV(t)
			
 
				 	kv["general.architecture"] = "gemma2"
			
 
				 	kv["gemma2.context_length"] = p.MaxPositionEmbeddings
			
--- a/convert/convert_gemma2_adapter.go
+++ b/convert/convert_gemma2_adapter.go
@@ -6,7 +6,7 @@ import (
 
				 	"github.com/pdevine/tensor"
			
 
				 	"github.com/pdevine/tensor/native"
			
 
				 
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				+	"github.com/ollama/ollama/fs/ggml"
			
 
				 )
			
 
				 
			
 
				 type gemma2Adapter struct {
			
@@ -15,14 +15,14 @@ type gemma2Adapter struct {
 
				 
			
 
				 var _ AdapterConverter = (*gemma2Adapter)(nil)
			
 
				 
			
 
				-func (p *gemma2Adapter) KV(baseKV llm.KV) llm.KV {
			
 
				+func (p *gemma2Adapter) KV(baseKV ggml.KV) ggml.KV {
			
 
				 	kv := p.AdapterParameters.KV()
			
 
				 	kv["general.architecture"] = "gemma2"
			
 
				 	return kv
			
 
				 }
			
 
				 
			
 
				-func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
			
 
				-	var out []llm.Tensor
			
 
				+func (p *gemma2Adapter) Tensors(ts []Tensor) []ggml.Tensor {
			
 
				+	var out []ggml.Tensor
			
 
				 	for _, t := range ts {
			
 
				 		shape := t.Shape()
			
 
				 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
			
@@ -31,7 +31,7 @@ func (p *gemma2Adapter) Tensors(ts []Tensor) []llm.Tensor {
 
				 			t.SetRepacker(p.repack)
			
 
				 		}
			
 
				 
			
 
				-		out = append(out, llm.Tensor{
			
 
				+		out = append(out, ggml.Tensor{
			
 
				 			Name:     t.Name(),
			
 
				 			Kind:     t.Kind(),
			
 
				 			Shape:    t.Shape(),
			
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -9,7 +9,7 @@ import (
 
				 	"github.com/pdevine/tensor"
			
 
				 	"github.com/pdevine/tensor/native"
			
 
				 
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				+	"github.com/ollama/ollama/fs/ggml"
			
 
				 )
			
 
				 
			
 
				 type llamaModel struct {
			
@@ -46,7 +46,7 @@ type llamaModel struct {
 
				 
			
 
				 var _ ModelConverter = (*llamaModel)(nil)
			
 
				 
			
 
				-func (p *llamaModel) KV(t *Tokenizer) llm.KV {
			
 
				+func (p *llamaModel) KV(t *Tokenizer) ggml.KV {
			
 
				 	kv := p.ModelParameters.KV(t)
			
 
				 	kv["general.architecture"] = "llama"
			
 
				 	kv["llama.vocab_size"] = p.VocabSize
			
@@ -120,11 +120,11 @@ func (p *llamaModel) KV(t *Tokenizer) llm.KV {
 
				 	return kv
			
 
				 }
			
 
				 
			
 
				-func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
			
 
				-	var out []llm.Tensor
			
 
				+func (p *llamaModel) Tensors(ts []Tensor) []ggml.Tensor {
			
 
				+	var out []ggml.Tensor
			
 
				 
			
 
				 	if p.RopeScaling.factors != nil {
			
 
				-		out = append(out, llm.Tensor{
			
 
				+		out = append(out, ggml.Tensor{
			
 
				 			Name:     "rope_freqs.weight",
			
 
				 			Kind:     0,
			
 
				 			Shape:    []uint64{uint64(len(p.RopeScaling.factors))},
			
@@ -138,7 +138,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []llm.Tensor {
 
				 			t.SetRepacker(p.repack)
			
 
				 		}
			
 
				 
			
 
				-		out = append(out, llm.Tensor{
			
 
				+		out = append(out, ggml.Tensor{
			
 
				 			Name:     t.Name(),
			
 
				 			Kind:     t.Kind(),
			
 
				 			Shape:    t.Shape(),
			
--- a/convert/convert_llama_adapter.go
+++ b/convert/convert_llama_adapter.go
@@ -7,7 +7,7 @@ import (
 
				 	"github.com/pdevine/tensor"
			
 
				 	"github.com/pdevine/tensor/native"
			
 
				 
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				+	"github.com/ollama/ollama/fs/ggml"
			
 
				 )
			
 
				 
			
 
				 type llamaAdapter struct {
			
@@ -18,7 +18,7 @@ type llamaAdapter struct {
 
				 
			
 
				 var _ AdapterConverter = (*llamaAdapter)(nil)
			
 
				 
			
 
				-func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
			
 
				+func (p *llamaAdapter) KV(baseKV ggml.KV) ggml.KV {
			
 
				 	kv := p.AdapterParameters.KV()
			
 
				 	kv["general.architecture"] = "llama"
			
 
				 	kv["llama.attention.head_count"] = baseKV["llama.attention.head_count"]
			
@@ -29,8 +29,8 @@ func (p *llamaAdapter) KV(baseKV llm.KV) llm.KV {
 
				 	return kv
			
 
				 }
			
 
				 
			
 
				-func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
			
 
				-	var out []llm.Tensor
			
 
				+func (p *llamaAdapter) Tensors(ts []Tensor) []ggml.Tensor {
			
 
				+	var out []ggml.Tensor
			
 
				 	for _, t := range ts {
			
 
				 		shape := t.Shape()
			
 
				 		if (strings.HasSuffix(t.Name(), "weight.lora_a") && shape[0] > shape[1]) ||
			
@@ -41,7 +41,7 @@ func (p *llamaAdapter) Tensors(ts []Tensor) []llm.Tensor {
 
				 			t.SetRepacker(p.repack)
			
 
				 		}
			
 
				 
			
 
				-		out = append(out, llm.Tensor{
			
 
				+		out = append(out, ggml.Tensor{
			
 
				 			Name:     t.Name(),
			
 
				 			Kind:     t.Kind(),
			
 
				 			Shape:    shape,
			
--- a/convert/convert_mixtral.go
+++ b/convert/convert_mixtral.go
@@ -6,7 +6,7 @@ import (
 
				 	"slices"
			
 
				 	"strings"
			
 
				 
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				+	"github.com/ollama/ollama/fs/ggml"
			
 
				 )
			
 
				 
			
 
				 type mixtralModel struct {
			
@@ -15,7 +15,7 @@ type mixtralModel struct {
 
				 	NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
			
 
				 }
			
 
				 
			
 
				-func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
			
 
				+func (p *mixtralModel) KV(t *Tokenizer) ggml.KV {
			
 
				 	kv := p.llamaModel.KV(t)
			
 
				 
			
 
				 	if p.NumLocalExperts > 0 {
			
@@ -29,7 +29,7 @@ func (p *mixtralModel) KV(t *Tokenizer) llm.KV {
 
				 	return kv
			
 
				 }
			
 
				 
			
 
				-func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
			
 
				+func (p *mixtralModel) Tensors(ts []Tensor) []ggml.Tensor {
			
 
				 	oldnew := []string{
			
 
				 		"model.layers", "blk",
			
 
				 		"w1", "ffn_gate_exps",
			
@@ -56,10 +56,10 @@ func (p *mixtralModel) Tensors(ts []Tensor) []llm.Tensor {
 
				 		return true
			
 
				 	})
			
 
				 
			
 
				-	var out []llm.Tensor
			
 
				+	var out []ggml.Tensor
			
 
				 	for n, e := range experts {
			
 
				 		// TODO(mxyng): sanity check experts
			
 
				-		out = append(out, llm.Tensor{
			
 
				+		out = append(out, ggml.Tensor{
			
 
				 			Name:     n,
			
 
				 			Kind:     e[0].Kind(),
			
 
				 			Shape:    append([]uint64{uint64(len(e))}, e[0].Shape()...),
			
--- a/convert/convert_phi3.go
+++ b/convert/convert_phi3.go
@@ -8,7 +8,7 @@ import (
 
				 	"strings"
			
 
				 	"sync"
			
 
				 
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				+	"github.com/ollama/ollama/fs/ggml"
			
 
				 )
			
 
				 
			
 
				 type phi3Model struct {
			
@@ -37,7 +37,7 @@ type phi3Model struct {
 
				 
			
 
				 var _ ModelConverter = (*phi3Model)(nil)
			
 
				 
			
 
				-func (p *phi3Model) KV(t *Tokenizer) llm.KV {
			
 
				+func (p *phi3Model) KV(t *Tokenizer) ggml.KV {
			
 
				 	kv := p.ModelParameters.KV(t)
			
 
				 	kv["general.architecture"] = "phi3"
			
 
				 	kv["phi3.context_length"] = p.MaxPositionEmbeddings
			
@@ -68,19 +68,19 @@ func (p *phi3Model) KV(t *Tokenizer) llm.KV {
 
				 	return kv
			
 
				 }
			
 
				 
			
 
				-func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
			
 
				+func (p *phi3Model) Tensors(ts []Tensor) []ggml.Tensor {
			
 
				 	var addRopeFactors sync.Once
			
 
				 
			
 
				-	out := make([]llm.Tensor, 0, len(ts)+2)
			
 
				+	out := make([]ggml.Tensor, 0, len(ts)+2)
			
 
				 	for _, t := range ts {
			
 
				 		if strings.HasPrefix(t.Name(), "blk.0.") {
			
 
				 			addRopeFactors.Do(func() {
			
 
				-				out = append(out, llm.Tensor{
			
 
				+				out = append(out, ggml.Tensor{
			
 
				 					Name:     "rope_factors_long.weight",
			
 
				 					Kind:     0,
			
 
				 					Shape:    []uint64{uint64(len(p.RopeScaling.LongFactor))},
			
 
				 					WriterTo: p.RopeScaling.LongFactor,
			
 
				-				}, llm.Tensor{
			
 
				+				}, ggml.Tensor{
			
 
				 					Name:     "rope_factors_short.weight",
			
 
				 					Kind:     0,
			
 
				 					Shape:    []uint64{uint64(len(p.RopeScaling.ShortFactor))},
			
@@ -89,7 +89,7 @@ func (p *phi3Model) Tensors(ts []Tensor) []llm.Tensor {
 
				 			})
			
 
				 		}
			
 
				 
			
 
				-		out = append(out, llm.Tensor{
			
 
				+		out = append(out, ggml.Tensor{
			
 
				 			Name:     t.Name(),
			
 
				 			Kind:     t.Kind(),
			
 
				 			Shape:    t.Shape(),
			
--- a/convert/convert_qwen2.go
+++ b/convert/convert_qwen2.go
@@ -1,6 +1,7 @@
 
				 package convert
			
 
				 
			
 
				-import "github.com/ollama/ollama/llm"
			
 
				+import "github.com/ollama/ollama/fs/ggml"
			
 
				+
			
 
				 
			
 
				 type qwen2Model struct {
			
 
				 	ModelParameters
			
@@ -21,7 +22,7 @@ type qwen2Model struct {
 
				 
			
 
				 var _ ModelConverter = (*qwen2Model)(nil)
			
 
				 
			
 
				-func (q *qwen2Model) KV(t *Tokenizer) llm.KV {
			
 
				+func (q *qwen2Model) KV(t *Tokenizer) ggml.KV {
			
 
				 	kv := q.ModelParameters.KV(t)
			
 
				 	kv["general.architecture"] = "qwen2"
			
 
				 	kv["qwen2.block_count"] = q.HiddenLayers
			
@@ -45,10 +46,10 @@ func (q *qwen2Model) KV(t *Tokenizer) llm.KV {
 
				 	return kv
			
 
				 }
			
 
				 
			
 
				-func (q *qwen2Model) Tensors(ts []Tensor) []llm.Tensor {
			
 
				-	var out []llm.Tensor
			
 
				+func (q *qwen2Model) Tensors(ts []Tensor) []ggml.Tensor {
			
 
				+	var out []ggml.Tensor
			
 
				 	for _, t := range ts {
			
 
				-		out = append(out, llm.Tensor{
			
 
				+		out = append(out, ggml.Tensor{
			
 
				 			Name:     t.Name(),
			
 
				 			Kind:     t.Kind(),
			
 
				 			Shape:    t.Shape(),
			
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -20,7 +20,7 @@ import (
 
				 
			
 
				 	"golang.org/x/exp/maps"
			
 
				 
			
 
				-	"github.com/ollama/ollama/llm"
			
 
				+	"github.com/ollama/ollama/fs/ggml"
			
 
				 )
			
 
				 
			
 
				 type tensorData struct {
			
@@ -29,7 +29,7 @@ type tensorData struct {
 
				 	Shape   []int  `json:"shape"`
			
 
				 }
			
 
				 
			
 
				-func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
			
 
				+func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
			
 
				 	t.Helper()
			
 
				 
			
 
				 	f, err := os.CreateTemp(t.TempDir(), "f16")
			
@@ -48,7 +48,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 
				 	}
			
 
				 	t.Cleanup(func() { r.Close() })
			
 
				 
			
 
				-	m, _, err := llm.DecodeGGML(r, math.MaxInt)
			
 
				+	m, _, err := ggml.Decode(r, math.MaxInt)
			
 
				 	if err != nil {
			
 
				 		t.Fatal(err)
			
 
				 	}
			
@@ -60,7 +60,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, llm.KV, *llm.Tensors) {
 
				 	return r, m.KV(), m.Tensors()
			
 
				 }
			
 
				 
			
 
				-func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tensors) map[string]string {
			
 
				+func generateResultsJSON(t *testing.T, f *os.File, kv ggml.KV, tensors ggml.Tensors) map[string]string {
			
 
				 	actual := make(map[string]string)
			
 
				 	for k, v := range kv {
			
 
				 		if s, ok := v.(json.Marshaler); !ok {
			
@@ -75,7 +75,7 @@ func generateResultsJSON(t *testing.T, f *os.File, kv llm.KV, tensors *llm.Tenso
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	for _, tensor := range tensors.Items {
			
 
				+	for _, tensor := range tensors.Items() {
			
 
				 		sha256sum := sha256.New()
			
 
				 		sr := io.NewSectionReader(f, int64(tensors.Offset+tensor.Offset), int64(tensor.Size()))
			
 
				 		if _, err := io.Copy(sha256sum, sr); err != nil {
			
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 
				 			}
			
 
				 			defer r.Close()
			
 
				 
			
 
				-			m, _, err := llm.DecodeGGML(r, math.MaxInt)
			
 
				+			m, _, err := ggml.Decode(r, math.MaxInt)
			
 
				 			if err != nil {
			
 
				 				t.Fatal(err)
			
 
				 			}
			
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -718,23 +718,18 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 
				 func LibraryDirs() []string {
			
 
				 	// dependencies can exist wherever we found the runners (e.g. build tree for developers) and relative to the executable
			
 
				 	// This can be simplified once we no longer carry runners as payloads
			
 
				-	paths := []string{}
			
 
				-	appExe, err := os.Executable()
			
 
				+	exe, err := os.Executable()
			
 
				 	if err != nil {
			
 
				 		slog.Warn("failed to lookup executable path", "error", err)
			
 
				-	} else {
			
 
				-		appRelative := filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe(), "lib", "ollama")
			
 
				-		if _, err := os.Stat(appRelative); err == nil {
			
 
				-			paths = append(paths, appRelative)
			
 
				-		}
			
 
				+		return nil
			
 
				 	}
			
 
				-	rDir := runners.Locate()
			
 
				-	if err != nil {
			
 
				-		slog.Warn("unable to locate gpu dependency libraries", "error", err)
			
 
				-	} else {
			
 
				-		paths = append(paths, filepath.Dir(rDir))
			
 
				+
			
 
				+	lib := filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama")
			
 
				+	if _, err := os.Stat(lib); err != nil {
			
 
				+		return nil
			
 
				 	}
			
 
				-	return paths
			
 
				+
			
 
				+	return []string{lib}
			
 
				 }
			
 
				 
			
 
				 func GetSystemInfo() SystemInfo {
			
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -1,15 +1,15 @@
 
				-package llm
			
 
				+package ggml
			
 
				 
			
 
				 import (
			
 
				 	"encoding/binary"
			
 
				 	"errors"
			
 
				 	"fmt"
			
 
				 	"io"
			
 
				+	"log/slog"
			
 
				 	"slices"
			
 
				 	"strings"
			
 
				-	"sync"
			
 
				 
			
 
				-	"github.com/ollama/ollama/util/bufioutil"
			
 
				+	"github.com/ollama/ollama/fs/util/bufioutil"
			
 
				 )
			
 
				 
			
 
				 type GGML struct {
			
@@ -19,145 +19,168 @@ type GGML struct {
 
				 
			
 
				 type model interface {
			
 
				 	KV() KV
			
 
				-	Tensors() *Tensors
			
 
				+	Tensors() Tensors
			
 
				 }
			
 
				 
			
 
				 type KV map[string]any
			
 
				 
			
 
				-func (kv KV) u64(key string) uint64 {
			
 
				-	switch v := kv[key].(type) {
			
 
				-	case uint64:
			
 
				-		return v
			
 
				-	case uint32:
			
 
				-		return uint64(v)
			
 
				-	case float64:
			
 
				-		return uint64(v)
			
 
				-	default:
			
 
				-		return 0
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				 func (kv KV) Architecture() string {
			
 
				-	if s, ok := kv["general.architecture"].(string); ok {
			
 
				-		return s
			
 
				-	}
			
 
				-
			
 
				-	return "unknown"
			
 
				+	return kv.String("general.architecture", "unknown")
			
 
				 }
			
 
				 
			
 
				 func (kv KV) Kind() string {
			
 
				-	if s, ok := kv["general.type"].(string); ok {
			
 
				-		return s
			
 
				-	}
			
 
				-
			
 
				-	return "unknown"
			
 
				+	return kv.String("general.type", "unknown")
			
 
				 }
			
 
				 
			
 
				 func (kv KV) ParameterCount() uint64 {
			
 
				-	return kv.u64("general.parameter_count")
			
 
				+	return keyValue[uint64](kv, "general.parameter_count")
			
 
				 }
			
 
				 
			
 
				 func (kv KV) FileType() fileType {
			
 
				-	if u64 := kv.u64("general.file_type"); u64 > 0 {
			
 
				-		return fileType(uint32(u64))
			
 
				+	if t := kv.Uint("general.file_type"); t > 0 {
			
 
				+		return fileType(t)
			
 
				 	}
			
 
				 
			
 
				 	return fileTypeUnknown
			
 
				 }
			
 
				 
			
 
				 func (kv KV) BlockCount() uint64 {
			
 
				-	return kv.u64(fmt.Sprintf("%s.block_count", kv.Architecture()))
			
 
				+	return uint64(kv.Uint("block_count"))
			
 
				+}
			
 
				+
			
 
				+func (kv KV) EmbeddingLength() uint64 {
			
 
				+	return uint64(kv.Uint("embedding_length"))
			
 
				 }
			
 
				 
			
 
				 func (kv KV) HeadCount() uint64 {
			
 
				-	return kv.u64(fmt.Sprintf("%s.attention.head_count", kv.Architecture()))
			
 
				+	return uint64(kv.Uint("attention.head_count"))
			
 
				 }
			
 
				 
			
 
				 func (kv KV) HeadCountKV() uint64 {
			
 
				-	if headCountKV := kv.u64(fmt.Sprintf("%s.attention.head_count_kv", kv.Architecture())); headCountKV > 0 {
			
 
				-		return headCountKV
			
 
				-	}
			
 
				-
			
 
				-	return 1
			
 
				+	return uint64(kv.Uint("attention.head_count_kv", 1))
			
 
				 }
			
 
				 
			
 
				 func (kv KV) EmbeddingHeadCount() uint64 {
			
 
				 	if heads := kv.HeadCount(); heads > 0 {
			
 
				-		return kv.EmbeddingLength() / kv.HeadCount()
			
 
				+		return kv.EmbeddingLength() / heads
			
 
				 	}
			
 
				 
			
 
				 	return 0
			
 
				 }
			
 
				 
			
 
				 func (kv KV) EmbeddingHeadCountK() uint64 {
			
 
				-	if k := kv.u64(fmt.Sprintf("%s.attention.key_length", kv.Architecture())); k > 0 {
			
 
				-		return k
			
 
				-	}
			
 
				-
			
 
				-	return kv.EmbeddingHeadCount()
			
 
				+	return uint64(kv.Uint("attention.key_length", uint32(kv.EmbeddingHeadCount())))
			
 
				 }
			
 
				 
			
 
				 func (kv KV) EmbeddingHeadCountV() uint64 {
			
 
				-	if v := kv.u64(fmt.Sprintf("%s.attention.value_length", kv.Architecture())); v > 0 {
			
 
				-		return v
			
 
				-	}
			
 
				-
			
 
				-	return kv.EmbeddingHeadCount()
			
 
				+	return uint64(kv.Uint("attention.value_length", uint32(kv.EmbeddingHeadCount())))
			
 
				 }
			
 
				 
			
 
				 func (kv KV) GQA() uint64 {
			
 
				 	return kv.HeadCount() / kv.HeadCountKV()
			
 
				 }
			
 
				 
			
 
				-func (kv KV) EmbeddingLength() uint64 {
			
 
				-	return kv.u64(fmt.Sprintf("%s.embedding_length", kv.Architecture()))
			
 
				-}
			
 
				-
			
 
				 func (kv KV) ContextLength() uint64 {
			
 
				-	return kv.u64(fmt.Sprintf("%s.context_length", kv.Architecture()))
			
 
				+	return uint64(kv.Uint("context_length"))
			
 
				 }
			
 
				 
			
 
				 func (kv KV) ChatTemplate() string {
			
 
				-	s, _ := kv["tokenizer.chat_template"].(string)
			
 
				+	return kv.String("tokenizer.chat_template")
			
 
				+}
			
 
				+
			
 
				+func (kv KV) String(key string, defaultValue ...string) string {
			
 
				+	return keyValue(kv, key, append(defaultValue, "")...)
			
 
				+}
			
 
				+
			
 
				+func (kv KV) Uint(key string, defaultValue ...uint32) uint32 {
			
 
				+	return keyValue(kv, key, append(defaultValue, 0)...)
			
 
				+}
			
 
				+
			
 
				+func (kv KV) Float(key string, defaultValue ...float32) float32 {
			
 
				+	return keyValue(kv, key, append(defaultValue, 0)...)
			
 
				+}
			
 
				+
			
 
				+func (kv KV) Strings(key string, defaultValue ...[]string) []string {
			
 
				+	r := keyValue(kv, key, &array{})
			
 
				+	s := make([]string, r.size)
			
 
				+	for i := range r.size {
			
 
				+		s[i] = r.values[i].(string)
			
 
				+	}
			
 
				+
			
 
				+	return s
			
 
				+}
			
 
				+
			
 
				+func (kv KV) Uints(key string, defaultValue ...[]uint32) []uint32 {
			
 
				+	r := keyValue(kv, key, &array{})
			
 
				+	s := make([]uint32, r.size)
			
 
				+	for i := range r.size {
			
 
				+		s[i] = uint32(r.values[i].(int32))
			
 
				+	}
			
 
				+
			
 
				 	return s
			
 
				 }
			
 
				 
			
 
				+func keyValue[T string | uint32 | uint64 | float32 | *array](kv KV, key string, defaultValue ...T) T {
			
 
				+	if !strings.HasPrefix(key, "tokenizer.") && !strings.HasPrefix(key, "general.") {
			
 
				+		key = kv.Architecture() + "." + key
			
 
				+	}
			
 
				+
			
 
				+	if val, ok := kv[key]; ok {
			
 
				+		return val.(T)
			
 
				+	}
			
 
				+
			
 
				+	slog.Warn("key not found", "key", key, "default", defaultValue[0])
			
 
				+	return defaultValue[0]
			
 
				+}
			
 
				+
			
 
				 type Tensors struct {
			
 
				-	Items  []*Tensor
			
 
				+	items  []*Tensor
			
 
				 	Offset uint64
			
 
				+}
			
 
				 
			
 
				-	layers     map[string]Layer
			
 
				-	layersOnce sync.Once
			
 
				-}
			
 
				-
			
 
				-func (ts *Tensors) Layers() map[string]Layer {
			
 
				-	ts.layersOnce.Do(func() {
			
 
				-		ts.layers = make(map[string]Layer)
			
 
				-		for _, t := range ts.Items {
			
 
				-			parts := strings.Split(t.Name, ".")
			
 
				-			if index := slices.IndexFunc(parts, func(s string) bool { return s == "blk" || s == "mm" }); index != -1 {
			
 
				-				if len(parts) > index+2 {
			
 
				-					// blk and mm should have a number after them, join it
			
 
				-					parts = append(
			
 
				-						[]string{strings.Join(parts[:index+2], ".")},
			
 
				-						parts[index+2:]...)
			
 
				-				}
			
 
				-			}
			
 
				+func (s Tensors) Items(prefix ...string) []*Tensor {
			
 
				+	if len(prefix) == 0 {
			
 
				+		return s.items
			
 
				+	}
			
 
				 
			
 
				-			if _, ok := ts.layers[parts[0]]; !ok {
			
 
				-				ts.layers[parts[0]] = make(Layer)
			
 
				-			}
			
 
				+	var items []*Tensor
			
 
				+	for _, t := range s.items {
			
 
				+		if strings.HasPrefix(t.Name, prefix[0]) {
			
 
				+			items = append(items, t)
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				-			ts.layers[parts[0]][strings.Join(parts[1:], ".")] = t
			
 
				+	return items
			
 
				+}
			
 
				+
			
 
				+func (ts Tensors) Layers() map[string]Layer {
			
 
				+	layers := make(map[string]Layer)
			
 
				+	for _, t := range ts.items {
			
 
				+		parts := strings.Split(t.Name, ".")
			
 
				+		if i := slices.Index(parts, "blk"); i > 0 {
			
 
				+			parts = append([]string{
			
 
				+				strings.Join(parts[:i], "."),
			
 
				+				strings.Join(parts[i:i+2], "."),
			
 
				+			}, parts[i+2:]...)
			
 
				+		} else if i == 0 {
			
 
				+			parts = append([]string{
			
 
				+				strings.Join(parts[i:i+2], "."),
			
 
				+			}, parts[i+2:]...)
			
 
				 		}
			
 
				-	})
			
 
				 
			
 
				-	return ts.layers
			
 
				+		if _, ok := layers[parts[0]]; !ok {
			
 
				+			layers[parts[0]] = make(Layer)
			
 
				+		}
			
 
				+
			
 
				+		layers[parts[0]][strings.Join(parts[1:], ".")] = t
			
 
				+	}
			
 
				+
			
 
				+	return layers
			
 
				 }
			
 
				 
			
 
				 type Layer map[string]*Tensor
			
 
				 
			
 
				-func (l Layer) size() (size uint64) {
			
 
				+func (l Layer) Size() (size uint64) {
			
 
				 	for _, t := range l {
			
 
				 		size += t.Size()
			
 
				 	}
			
@@ -255,8 +278,6 @@ func (t Tensor) typeSize() uint64 {
 
				 		return 8
			
 
				 	case 29: // IQ1_M
			
 
				 		return blockSize/8 + blockSize/16 + blockSize/32
			
 
				-	case 30: // BF16
			
 
				-		return 2
			
 
				 	default:
			
 
				 		return 0
			
 
				 	}
			
@@ -295,7 +316,7 @@ const (
 
				 
			
 
				 var ErrUnsupportedFormat = errors.New("unsupported model format")
			
 
				 
			
 
				-func DetectGGMLType(b []byte) string {
			
 
				+func DetectContentType(b []byte) string {
			
 
				 	switch binary.LittleEndian.Uint32(b[:4]) {
			
 
				 	case FILE_MAGIC_GGML:
			
 
				 		return "ggml"
			
@@ -312,12 +333,12 @@ func DetectGGMLType(b []byte) string {
 
				 	}
			
 
				 }
			
 
				 
			
 
				-// DecodeGGML decodes a GGML model from the given reader.
			
 
				+// Decode decodes a GGML model from the given reader.
			
 
				 //
			
 
				 // It collects array values for arrays with a size less than or equal to
			
 
				 // maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
			
 
				 // the maxArraySize is negative, all arrays are collected.
			
 
				-func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
			
 
				+func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
			
 
				 	if maxArraySize == 0 {
			
 
				 		maxArraySize = 1024
			
 
				 	}
			
@@ -331,10 +352,6 @@ func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 
				 
			
 
				 	var c container
			
 
				 	switch magic {
			
 
				-	case FILE_MAGIC_GGML, FILE_MAGIC_GGMF, FILE_MAGIC_GGJT:
			
 
				-		return nil, 0, ErrUnsupportedFormat
			
 
				-	case FILE_MAGIC_GGLA:
			
 
				-		c = &containerGGLA{}
			
 
				 	case FILE_MAGIC_GGUF_LE:
			
 
				 		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
			
 
				 	case FILE_MAGIC_GGUF_BE:
			
@@ -530,21 +547,20 @@ func (llm GGML) GraphSize(context, batch uint64, kvCacheType string) (kv, partia
 
				 }
			
 
				 
			
 
				 // SupportsKVCacheType checks if the requested cache type is supported
			
 
				-func (ggml GGML) SupportsKVCacheType(cacheType string) bool {
			
 
				-	validKVCacheTypes := []string{"f16", "q8_0", "q4_0"}
			
 
				-	return slices.Contains(validKVCacheTypes, cacheType)
			
 
				+func (llm GGML) SupportsKVCacheType(cacheType string) bool {
			
 
				+	return slices.Contains([]string{"f16", "q8_0", "q4_0"}, cacheType)
			
 
				 }
			
 
				 
			
 
				 // SupportsFlashAttention checks if the model supports flash attention
			
 
				-func (ggml GGML) SupportsFlashAttention() bool {
			
 
				-	_, isEmbedding := ggml.KV()[fmt.Sprintf("%s.pooling_type", ggml.KV().Architecture())]
			
 
				+func (llm GGML) SupportsFlashAttention() bool {
			
 
				+	_, isEmbedding := llm.KV()[fmt.Sprintf("%s.pooling_type", llm.KV().Architecture())]
			
 
				 	if isEmbedding {
			
 
				 		return false
			
 
				 	}
			
 
				 
			
 
				 	// Check head counts match and are non-zero
			
 
				-	headCountK := ggml.KV().EmbeddingHeadCountK()
			
 
				-	headCountV := ggml.KV().EmbeddingHeadCountV()
			
 
				+	headCountK := llm.KV().EmbeddingHeadCountK()
			
 
				+	headCountV := llm.KV().EmbeddingHeadCountV()
			
 
				 	return headCountK != 0 && headCountV != 0 && headCountK == headCountV
			
 
				 }
			
 
				 
			
--- a/fs/ggml/gguf.go
+++ b/fs/ggml/gguf.go
@@ -1,4 +1,4 @@
 
				-package llm
			
 
				+package ggml
			
 
				 
			
 
				 import (
			
 
				 	"bytes"
			
@@ -8,10 +8,9 @@ import (
 
				 	"fmt"
			
 
				 	"io"
			
 
				 	"log/slog"
			
 
				+	"maps"
			
 
				 	"slices"
			
 
				 	"strings"
			
 
				-
			
 
				-	"golang.org/x/exp/maps"
			
 
				 )
			
 
				 
			
 
				 type containerGGUF struct {
			
@@ -110,9 +109,9 @@ func (llm *gguf) KV() KV {
 
				 	return llm.kv
			
 
				 }
			
 
				 
			
 
				-func (llm *gguf) Tensors() *Tensors {
			
 
				-	return &Tensors{
			
 
				-		Items:  llm.tensors,
			
 
				+func (llm *gguf) Tensors() Tensors {
			
 
				+	return Tensors{
			
 
				+		items:  llm.tensors,
			
 
				 		Offset: llm.tensorOffset,
			
 
				 	}
			
 
				 }
			
@@ -523,7 +522,7 @@ func WriteGGUF(ws io.WriteSeeker, kv KV, ts []Tensor) error {
 
				 		return err
			
 
				 	}
			
 
				 
			
 
				-	keys := maps.Keys(kv)
			
 
				+	keys := slices.Collect(maps.Keys(kv))
			
 
				 	slices.Sort(keys)
			
 
				 
			
 
				 	for _, key := range keys {
			
--- a/fs/ggml/type.go
+++ b/fs/ggml/type.go
@@ -1,4 +1,4 @@
 
				-package llm
			
 
				+package ggml
			
 
				 
			
 
				 import "fmt"
			
 
				 
			
@@ -32,10 +32,9 @@ const (
 
				 	fileTypeIQ1_S
			
 
				 	fileTypeIQ4_NL
			
 
				 	fileTypeIQ3_S
			
 
				-	fileTypeIQ3_M
			
 
				 	fileTypeIQ2_S
			
 
				-	fileTypeIQ2_M
			
 
				 	fileTypeIQ4_XS
			
 
				+	fileTypeIQ2_M
			
 
				 	fileTypeIQ1_M
			
 
				 	fileTypeBF16
			
 
				 
			
@@ -94,8 +93,6 @@ func ParseFileType(s string) (fileType, error) {
 
				 		return fileTypeIQ4_NL, nil
			
 
				 	case "IQ3_S":
			
 
				 		return fileTypeIQ3_S, nil
			
 
				-	case "IQ3_M":
			
 
				-		return fileTypeIQ3_M, nil
			
 
				 	case "IQ2_S":
			
 
				 		return fileTypeIQ2_S, nil
			
 
				 	case "IQ4_XS":
			
@@ -163,8 +160,6 @@ func (t fileType) String() string {
 
				 		return "IQ4_NL"
			
 
				 	case fileTypeIQ3_S:
			
 
				 		return "IQ3_S"
			
 
				-	case fileTypeIQ3_M:
			
 
				-		return "IQ3_M"
			
 
				 	case fileTypeIQ2_S:
			
 
				 		return "IQ2_S"
			
 
				 	case fileTypeIQ4_XS:
			
--- a/fs/util/bufioutil/buffer_seeker.go
+++ b/fs/util/bufioutil/buffer_seeker.go
--- a/fs/util/bufioutil/buffer_seeker_test.go
+++ b/fs/util/bufioutil/buffer_seeker_test.go
--- a/go.mod
+++ b/go.mod
@@ -17,12 +17,14 @@ require (
 
				 require (
			
 
				 	github.com/agnivade/levenshtein v1.1.1
			
 
				 	github.com/d4l3k/go-bfloat16 v0.0.0-20211005043715-690c3bdd05f1
			
 
				+	github.com/dlclark/regexp2 v1.11.4
			
 
				 	github.com/emirpasic/gods/v2 v2.0.0-alpha
			
 
				 	github.com/google/go-cmp v0.6.0
			
 
				 	github.com/mattn/go-runewidth v0.0.14
			
 
				 	github.com/nlpodyssey/gopickle v0.3.0
			
 
				 	github.com/pdevine/tensor v0.0.0-20240510204454-f88f4562727c
			
 
				 	golang.org/x/image v0.22.0
			
 
				+	gonum.org/v1/gonum v0.15.0
			
 
				 )
			
 
				 
			
 
				 require (
			
@@ -42,7 +44,6 @@ require (
 
				 	github.com/xtgo/set v1.0.0 // indirect
			
 
				 	go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
			
 
				 	golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect
			
 
				-	gonum.org/v1/gonum v0.15.0 // indirect
			
 
				 	gorgonia.org/vecf32 v0.9.0 // indirect
			
 
				 	gorgonia.org/vecf64 v0.9.0 // indirect
			
 
				 )
			
--- a/go.sum
+++ b/go.sum
@@ -42,6 +42,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
 
				 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
			
 
				 github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g=
			
 
				 github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA=
			
 
				+github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
			
 
				+github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
			
 
				 github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
			
 
				 github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
			
 
				 github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
			
--- a/llama/README.md
+++ b/llama/README.md
@@ -37,8 +37,7 @@ go build -tags avx .
 
				 ```shell
			
 
				 # go doesn't recognize `-mfma` as a valid compiler flag
			
 
				 # see https://github.com/golang/go/issues/17895
			
 
				-go env -w "CGO_CFLAGS_ALLOW=-mfma|-mf16c"
			
 
				-go env -w "CGO_CXXFLAGS_ALLOW=-mfma|-mf16c"
			
 
				+go env -w "CGO_CPPFLAGS_ALLOW=-mfma|-mf16c"
			
 
				 go build -tags=avx,avx2 .
			
 
				 ```
			
 
				 
			
--- a/llama/amx.h
+++ b/llama/amx.h
@@ -1,34 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "ggml-backend.h"
			
 
				-#include "ggml-cpu-impl.h"
			
 
				-
			
 
				-// GGML internal header
			
 
				-
			
 
				-#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
			
 
				-ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
			
 
				-#endif
			
--- a/llama/ggml-blas.h
+++ b/llama/ggml-blas.h
@@ -1,51 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#pragma once
			
 
				-
			
 
				-#include "ggml.h"
			
 
				-#include "ggml-backend.h"
			
 
				-
			
 
				-
			
 
				-#ifdef  __cplusplus
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// backend API
			
 
				-GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
			
 
				-
			
 
				-GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
			
 
				-
			
 
				-// number of threads used for conversion to float
			
 
				-// for openblas and blis, this will also set the number of threads used for blas operations
			
 
				-GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
			
 
				-
			
 
				-GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
			
 
				-
			
 
				-
			
 
				-#ifdef  __cplusplus
			
 
				-}
			
 
				-#endif
			
--- a/llama/ggml-cpu-aarch64.h
+++ b/llama/ggml-cpu-aarch64.h
@@ -1,34 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#pragma once
			
 
				-
			
 
				-#include "ggml-cpu-traits.h"
			
 
				-#include "ggml.h"
			
 
				-
			
 
				-// GGML internal header
			
 
				-
			
 
				-ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
			
--- a/llama/ggml-cpu-traits.h
+++ b/llama/ggml-cpu-traits.h
@@ -1,64 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#pragma once
			
 
				-#include "ggml-backend-impl.h"
			
 
				-#include "ggml-cpu-impl.h"
			
 
				-#include "ggml.h"
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-#    include <vector>
			
 
				-extern "C" {
			
 
				-#endif
			
 
				-
			
 
				-// return true if op part of extra "accelerator"
			
 
				-bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op);
			
 
				-bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size);
			
 
				-
			
 
				-#ifdef __cplusplus
			
 
				-}
			
 
				-
			
 
				-namespace ggml::cpu {
			
 
				-// register in tensor->extra
			
 
				-class tensor_traits {
			
 
				-  public:
			
 
				-    virtual ~tensor_traits();
			
 
				-    virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size)        = 0;
			
 
				-    virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
			
 
				-};
			
 
				-
			
 
				-class extra_buffer_type {
			
 
				-  public:
			
 
				-    virtual ~extra_buffer_type();
			
 
				-    virtual bool            supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
			
 
				-    virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op)                   = 0;
			
 
				-};
			
 
				-}  // namespace ggml::cpu
			
 
				-
			
 
				-// implemented in ggml-cpu.cpp.
			
 
				-std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();
			
 
				-
			
 
				-#endif
			
--- a/llama/ggml-cuda/acc.cuh
+++ b/llama/ggml-cuda/acc.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_ACC_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/arange.cu
+++ b/llama/ggml-cuda/arange.cu
@@ -1,60 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "arange.cuh"
			
 
				-
			
 
				-static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) {
			
 
				-    // blockIDx.x: idx of ne0 / BLOCK_SIZE
			
 
				-    int nidx = threadIdx.x + blockIdx.x * blockDim.x;
			
 
				-    if (nidx >= ne0) {
			
 
				-        return;
			
 
				-    }
			
 
				-    dst[nidx] = start + step * nidx;
			
 
				-}
			
 
				-
			
 
				-static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) {
			
 
				-    int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE;
			
 
				-    arange_f32<<<num_blocks, CUDA_ARANGE_BLOCK_SIZE, 0, stream>>>(dst, ne0, start,  step);
			
 
				-}
			
 
				-
			
 
				-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
			
 
				-    float * dst_d = (float *)dst->data;
			
 
				-    cudaStream_t stream = ctx.stream();
			
 
				-
			
 
				-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
			
 
				-
			
 
				-    float start;
			
 
				-    float stop;
			
 
				-    float step;
			
 
				-    memcpy(&start, (float *)dst->op_params + 0, sizeof(float));
			
 
				-    memcpy(&stop,  (float *)dst->op_params + 1, sizeof(float));
			
 
				-    memcpy(&step,  (float *)dst->op_params + 2, sizeof(float));
			
 
				-
			
 
				-    int64_t steps = (int64_t)ceil((stop - start) / step);
			
 
				-    GGML_ASSERT(ggml_nelements(dst) == steps);
			
 
				-
			
 
				-    arange_f32_cuda(dst_d, dst->ne[0], start, step, stream);
			
 
				-}
			
--- a/llama/ggml-cuda/arange.cuh
+++ b/llama/ggml-cuda/arange.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_ARANGE_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/argmax.cuh
+++ b/llama/ggml-cuda/argmax.cuh
@@ -1,29 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/argsort.cuh
+++ b/llama/ggml-cuda/argsort.cuh
@@ -1,29 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/binbcast.cuh
+++ b/llama/ggml-cuda/binbcast.cuh
@@ -1,35 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
 
				-void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
 
				-void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
 
				-void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
 
				-void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
 
				-
			
 
				-void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/clamp.cu
+++ b/llama/ggml-cuda/clamp.cu
@@ -1,60 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "clamp.cuh"
			
 
				-
			
 
				-static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
			
 
				-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
			
 
				-
			
 
				-    if (i >= k) {
			
 
				-        return;
			
 
				-    }
			
 
				-
			
 
				-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
			
 
				-}
			
 
				-
			
 
				-static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
			
 
				-    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
			
 
				-    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
			
 
				-    const ggml_tensor * src0 = dst->src[0];
			
 
				-    const float * src0_d = (const float *)src0->data;
			
 
				-    float * dst_d = (float *)dst->data;
			
 
				-    cudaStream_t stream = ctx.stream();
			
 
				-
			
 
				-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
			
 
				-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
			
 
				-
			
 
				-    float min;
			
 
				-    float max;
			
 
				-    memcpy(&min, dst->op_params, sizeof(float));
			
 
				-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
			
 
				-
			
 
				-    clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
			
 
				-}
			
--- a/llama/ggml-cuda/clamp.cuh
+++ b/llama/ggml-cuda/clamp.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_CLAMP_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/concat.cuh
+++ b/llama/ggml-cuda/concat.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_CONCAT_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/conv-transpose-1d.cuh
+++ b/llama/ggml-cuda/conv-transpose-1d.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_CONV_TRANPOSE_1D_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/convert.cuh
+++ b/llama/ggml-cuda/convert.cuh
@@ -1,39 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
			
 
				-
			
 
				-template<typename T>
			
 
				-using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream);
			
 
				-
			
 
				-typedef to_t_cuda_t<float> to_fp32_cuda_t;
			
 
				-typedef to_t_cuda_t<half> to_fp16_cuda_t;
			
 
				-
			
 
				-to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type);
			
 
				-
			
 
				-to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
			
--- a/llama/ggml-cuda/count-equal.cuh
+++ b/llama/ggml-cuda/count-equal.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128
			
 
				-
			
 
				-void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/cpy.cuh
+++ b/llama/ggml-cuda/cpy.cuh
@@ -1,35 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_CPY_BLOCK_SIZE 64
			
 
				-
			
 
				-void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
			
 
				-
			
 
				-void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
 
				-
			
 
				-void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
			
--- a/llama/ggml-cuda/cross-entropy-loss.cuh
+++ b/llama/ggml-cuda/cross-entropy-loss.cuh
@@ -1,33 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
 
				-
			
 
				-void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/diagmask.cuh
+++ b/llama/ggml-cuda/diagmask.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
			
 
				-
			
 
				-void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/fattn-tile-f16.cuh
+++ b/llama/ggml-cuda/fattn-tile-f16.cuh
@@ -1,29 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/fattn-tile-f32.cuh
+++ b/llama/ggml-cuda/fattn-tile-f32.cuh
@@ -1,29 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/fattn.cuh
+++ b/llama/ggml-cuda/fattn.cuh
@@ -1,29 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/getrows.cuh
+++ b/llama/ggml-cuda/getrows.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_GET_ROWS_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/im2col.cuh
+++ b/llama/ggml-cuda/im2col.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_IM2COL_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/mmv.cuh
+++ b/llama/ggml-cuda/mmv.cuh
@@ -1,38 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
			
 
				-#define MMV_MAX_ROWS 512
			
 
				-
			
 
				-void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
			
 
				-
			
 
				-void ggml_cuda_op_mul_mat_vec(
			
 
				-    ggml_backend_cuda_context & ctx,
			
 
				-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
			
 
				-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
			
 
				-    const int64_t src1_padded_row_size, cudaStream_t stream);
			
--- a/llama/ggml-cuda/mmvq.cuh
+++ b/llama/ggml-cuda/mmvq.cuh
@@ -1,35 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
			
 
				-
			
 
				-void ggml_cuda_op_mul_mat_vec_q(
			
 
				-    ggml_backend_cuda_context & ctx,
			
 
				-    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
			
 
				-    const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
			
 
				-    const int64_t src1_padded_row_size, cudaStream_t stream);
			
--- a/llama/ggml-cuda/norm.cuh
+++ b/llama/ggml-cuda/norm.cuh
@@ -1,33 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
 
				-
			
 
				-void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
 
				-
			
 
				-void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/opt-step-adamw.cuh
+++ b/llama/ggml-cuda/opt-step-adamw.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_OPT_STEP_ADAMW_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/out-prod.cuh
+++ b/llama/ggml-cuda/out-prod.cuh
@@ -1,29 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/pad.cuh
+++ b/llama/ggml-cuda/pad.cuh
@@ -1,32 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_PAD_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
 
				-void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/pool2d.cuh
+++ b/llama/ggml-cuda/pool2d.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_POOL2D_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/quantize.cuh
+++ b/llama/ggml-cuda/quantize.cuh
@@ -1,50 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#pragma once
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-#include "mmq.cuh"
			
 
				-
			
 
				-#include <cstdint>
			
 
				-
			
 
				-#define CUDA_QUANTIZE_BLOCK_SIZE     256
			
 
				-#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
			
 
				-
			
 
				-static_assert(MATRIX_ROW_PADDING %    CUDA_QUANTIZE_BLOCK_SIZE      == 0, "Risk of out-of-bounds access.");
			
 
				-static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
			
 
				-
			
 
				-typedef void (*quantize_cuda_t)(
			
 
				-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
			
 
				-    const ggml_type type_x, cudaStream_t stream);
			
 
				-
			
 
				-void quantize_row_q8_1_cuda(
			
 
				-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
			
 
				-    const ggml_type type_x, cudaStream_t stream);
			
 
				-
			
 
				-void quantize_mmq_q8_1_cuda(
			
 
				-    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
			
 
				-    const ggml_type type_x, cudaStream_t stream);
			
--- a/llama/ggml-cuda/rope.cuh
+++ b/llama/ggml-cuda/rope.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_ROPE_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/scale.cu
+++ b/llama/ggml-cuda/scale.cu
@@ -1,57 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "scale.cuh"
			
 
				-
			
 
				-static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) {
			
 
				-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
			
 
				-
			
 
				-    if (i >= k) {
			
 
				-        return;
			
 
				-    }
			
 
				-
			
 
				-    dst[i] = scale * x[i];
			
 
				-}
			
 
				-
			
 
				-static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) {
			
 
				-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
			
 
				-    scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
			
 
				-}
			
 
				-
			
 
				-void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
			
 
				-    const ggml_tensor * src0 = dst->src[0];
			
 
				-    const float * src0_d = (const float *)src0->data;
			
 
				-    float * dst_d = (float *)dst->data;
			
 
				-    cudaStream_t stream = ctx.stream();
			
 
				-
			
 
				-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
			
 
				-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
			
 
				-
			
 
				-    float scale;
			
 
				-    memcpy(&scale, dst->op_params, sizeof(float));
			
 
				-
			
 
				-    scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
			
 
				-}
			
--- a/llama/ggml-cuda/scale.cuh
+++ b/llama/ggml-cuda/scale.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_SCALE_BLOCK_SIZE 256
			
 
				-
			
 
				-void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/softmax.cuh
+++ b/llama/ggml-cuda/softmax.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
			
 
				-
			
 
				-void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/sum.cuh
+++ b/llama/ggml-cuda/sum.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
			
 
				-
			
 
				-void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/sumrows.cu
+++ b/llama/ggml-cuda/sumrows.cu
@@ -1,65 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "sumrows.cuh"
			
 
				-
			
 
				-static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) {
			
 
				-    const int row = blockIdx.x;
			
 
				-    const int col = threadIdx.x;
			
 
				-
			
 
				-    float sum = 0.0f;
			
 
				-    for (int i = col; i < ncols; i += blockDim.x) {
			
 
				-        sum += x[row * ncols + i];
			
 
				-    }
			
 
				-
			
 
				-    sum = warp_reduce_sum(sum);
			
 
				-
			
 
				-    if (col == 0) {
			
 
				-        dst[row] = sum;
			
 
				-    }
			
 
				-}
			
 
				-
			
 
				-void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
			
 
				-    const dim3 block_dims(WARP_SIZE, 1, 1);
			
 
				-    const dim3 block_nums(nrows, 1, 1);
			
 
				-    k_sum_rows_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
			
 
				-}
			
 
				-
			
 
				-void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
			
 
				-    const ggml_tensor * src0 = dst->src[0];
			
 
				-    const float * src0_d = (const float *)src0->data;
			
 
				-    float * dst_d = (float *)dst->data;
			
 
				-    cudaStream_t stream = ctx.stream();
			
 
				-
			
 
				-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
			
 
				-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
			
 
				-    GGML_ASSERT(ggml_is_contiguous(src0));
			
 
				-
			
 
				-    const int64_t ncols = src0->ne[0];
			
 
				-    const int64_t nrows = ggml_nrows(src0);
			
 
				-
			
 
				-    sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream);
			
 
				-}
			
--- a/llama/ggml-cuda/sumrows.cuh
+++ b/llama/ggml-cuda/sumrows.cuh
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-#include "common.cuh"
			
 
				-
			
 
				-void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream);
			
 
				-
			
 
				-void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_F16);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q4_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q5_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_F16, GGML_TYPE_Q8_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_F16);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q5_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_0, GGML_TYPE_Q8_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_F16);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q4_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q4_1, GGML_TYPE_Q8_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_F16);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q4_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_0, GGML_TYPE_Q8_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_F16);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q4_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q5_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_F16);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q4_1);
			
--- a/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
+++ b/llama/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu
@@ -1,31 +0,0 @@
 
				-/**
			
 
				- * llama.cpp - commit 46e3556e01b824e52395fb050b29804b6cff2a7c - do not edit this file
			
 
				- *
			
 
				- * MIT License
			
 
				- *
			
 
				- * Copyright (c) 2023-2024 The ggml authors
			
 
				- *
			
 
				- * Permission is hereby granted, free of charge, to any person obtaining a copy
			
 
				- * of this software and associated documentation files (the "Software"), to deal
			
 
				- * in the Software without restriction, including without limitation the rights
			
 
				- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				- * copies of the Software, and to permit persons to whom the Software is
			
 
				- * furnished to do so, subject to the following conditions:
			
 
				- *
			
 
				- * The above copyright notice and this permission notice shall be included in all
			
 
				- * copies or substantial portions of the Software.
			
 
				- *
			
 
				- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
			
 
				- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
			
 
				- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
			
 
				- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
			
 
				- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
			
 
				- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
			
 
				- * SOFTWARE.
			
 
				- */
			
 
				-
			
 
				-// This file has been autogenerated by generate_cu_files.py, do not edit manually.
			
 
				-
			
 
				-#include "../fattn-vec-f16.cuh"
			
 
				-
			
 
				-DECL_FATTN_VEC_F16_CASE(128, GGML_TYPE_Q8_0, GGML_TYPE_Q5_0);