浏览代码

remove build scripts

jmorganca 11 月之前
父节点
当前提交
e9d15eb277
共有 5 个文件被更改,包括 155 次插入和 305 次删除
  1. 150 146
      llama/Makefile
  2. 3 14
      llama/README.md
  3. 0 47
      llama/build_cuda.sh
  4. 0 96
      llama/build_hipblas.sh
  5. 2 2
      llama/llama.go

+ 150 - 146
llama/Makefile

@@ -1,146 +1,150 @@
-OS := $(shell uname -s)
-ARCH := $(or $(ARCH), $(shell uname -m))
-NVCC := nvcc
-HIP_PATH := $(shell cygpath -w -s "$(HIP_PATH)")
-HIPCC := "$(HIP_PATH)/bin/hipcc.bin.exe"
-
-ifeq ($(ARCH),x86_64)
-    ARCH := amd64
-endif
-
-# Determine object file extension based on OS
-ifneq (,$(findstring MINGW,$(OS)))
-    OBJ_EXT := obj
-	SHARED_EXT := dll
-else
-    OBJ_EXT := o
-	SHARED_EXT := so
-endif
-
-CUDA_SRCS := \
-    ggml-cuda.cu \
-    $(wildcard ggml-cuda/*.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \
-    $(wildcard ggml-cuda/template-instances/mmq*.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) \
-    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
-
-CUDA_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
-CUDA_OBJS := $(CUDA_OBJS:.c=.cuda.$(OBJ_EXT))
-CUDA_OBJS := $(CUDA_OBJS:.cpp=.cuda.$(OBJ_EXT))
-
-HIP_OBJS := $(CUDA_SRCS:.cu=.hip.$(OBJ_EXT))
-HIP_OBJS := $(HIP_OBJS:.c=.hip.$(OBJ_EXT))
-HIP_OBJS := $(HIP_OBJS:.cpp=.hip.$(OBJ_EXT))
-
-CUDA_FLAGS := \
-    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
-    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
-    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
-    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
-    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
-    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
-    -DGGML_CUDA_DMMV_X=32 \
-    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-    -DGGML_USE_CUDA=1 \
-    -DGGML_SHARED=1 \
-    -DGGML_BUILD=1 \
-    -DGGML_USE_LLAMAFILE \
-    -D_GNU_SOURCE \
-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-    -Wno-deprecated-gpu-targets \
-    --forward-unknown-to-host-compiler \
-    -use_fast_math \
-    -link \
-    -shared \
-    -I. \
-    -O3
-
-HIP_ARCHS := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
-LINUX_HIP_ARCHS := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
-
-HIP_FLAGS := \
-    -c \
-	-O3 \
-	-DGGML_USE_CUDA \
-	-DGGML_BUILD=1 \
-	-DGGML_SHARED=1 \
-    -DGGML_CUDA_DMMV_X=32 \
-	-DGGML_CUDA_MMV_Y=1 \
-	-DGGML_SCHED_MAX_COPIES=4 \
-    -DGGML_USE_HIPBLAS \
-	-DGGML_USE_LLAMAFILE \
-	-DHIP_FAST_MATH \
-	-DNDEBUG \
-    -DK_QUANTS_PER_ITERATION=2 \
-	-D_CRT_SECURE_NO_WARNINGS \
-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-	-D_GNU_SOURCE \
-    -Wno-expansion-to-defined \
-	-Wno-invalid-noreturn \
-	-Wno-ignored-attributes \
-    -Wno-pass-failed \
-	-Wno-deprecated-declarations \
-	-Wno-unused-result \
-	-I. \
-    $(foreach arch, $(HIP_ARCHS), --offload-arch=$(arch))
-
-ifeq ($(UNAME_S), Linux)
-    HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch))
-endif
-
-ifeq ($(OS),Darwin)
-    ifeq ($(ARCH),arm64)
-        all: ollama_runner
-    else ifeq ($(ARCH),amd64)
-        all: ollama_runner ollama_runner_avx ollama_runner_avx2
-    endif
-else
-    all: ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm
-endif
-
-%.cuda.$(OBJ_EXT): %.cu
-	$(NVCC) -c $(CUDA_FLAGS) -o $@ $<
-
-%.cuda.$(OBJ_EXT): %.c
-	$(NVCC) -c $(CFLAGS) -o $@ $<
-
-%.cuda.$(OBJ_EXT): %.cpp
-	$(NVCC) -c $(CXXFLAGS) -o $@ $<
-
-ggml_cuda.$(SHARED_EXT): $(CUDA_OBJS)
-	nvcc --shared -lcuda -lcublas -lcudart -lcublasLt $(CUDA_FLAGS) $(CUDA_OBJS) -o $@
-
-%.hip.$(OBJ_EXT): %.cu
-	$(HIPCC) -c $(HIP_FLAGS) -o $@ $<
-
-%.hip.$(OBJ_EXT): %.c
-	$(HIPCC) -c $(CFLAGS) -o $@ $<
-
-%.hip.$(OBJ_EXT): %.cpp
-	$(HIPCC) -c $(CXXFLAGS) -o $@ $<
-
-ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS)
-	$(HIPCC) --shared -lhipblas -lamdhip64 -lrocblas $(HIP_OBJS) -o $@
-
-ollama_runner:
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -o $@ ./runner 
-
-ollama_runner_avx:
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx -o $@ ./runner
-
-ollama_runner_avx2:
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,avx2 -o $@ ./runner
-
-ollama_runner_cuda: ggml_cuda.dll
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
-
-ollama_runner_rocm: ggml_hipblas.dll
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner
-
-clean:
-	rm -f $(CUDA_OBJS) $(HIP_OBJS) ggml_cuda.$(SHARED_EXT) ggml_cuda.* ggml_hipblas.* ollama_runner*
-
-.PHONY: all clean ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm
+OS := $(shell uname -s)
+ARCH := $(or $(ARCH), $(shell uname -m))
+NVCC := nvcc
+
+export CGO_CFLAGS_ALLOW = -mfma|-mf16c
+export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
+
+ifeq ($(ARCH),x86_64)
+    ARCH := amd64
+endif
+
+ifneq (,$(findstring MINGW,$(OS)))
+    OBJ_EXT := obj
+	SHARED_EXT := dll
+    HIP_PATH := $(shell cygpath -w -s "$(HIP_PATH)")
+else
+    OBJ_EXT := o
+	SHARED_EXT := so
+endif
+
+CUDA_SRCS := \
+    ggml-cuda.cu \
+    $(wildcard ggml-cuda/*.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \
+    $(wildcard ggml-cuda/template-instances/mmq*.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) \
+    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
+
+CUDA_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
+CUDA_OBJS := $(CUDA_OBJS:.c=.cuda.$(OBJ_EXT))
+CUDA_OBJS := $(CUDA_OBJS:.cpp=.cuda.$(OBJ_EXT))
+
+HIP_OBJS := $(CUDA_SRCS:.cu=.hip.$(OBJ_EXT))
+HIP_OBJS := $(HIP_OBJS:.c=.hip.$(OBJ_EXT))
+HIP_OBJS := $(HIP_OBJS:.cpp=.hip.$(OBJ_EXT))
+
+CUDA_FLAGS := \
+    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
+    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
+    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
+    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
+    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
+    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
+    -DGGML_CUDA_DMMV_X=32 \
+    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
+    -DGGML_USE_CUDA=1 \
+    -DGGML_SHARED=1 \
+    -DGGML_BUILD=1 \
+    -DGGML_USE_LLAMAFILE \
+    -D_GNU_SOURCE \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
+    -Wno-deprecated-gpu-targets \
+    --forward-unknown-to-host-compiler \
+    -use_fast_math \
+    -link \
+    -shared \
+    -I. \
+    -O3
+
+HIP_ARCHS := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
+LINUX_HIP_ARCHS := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
+
+HIPCC := "$(HIP_PATH)/bin/hipcc.bin.exe"
+HIP_FLAGS := \
+    -c \
+	-O3 \
+	-DGGML_USE_CUDA \
+	-DGGML_BUILD=1 \
+	-DGGML_SHARED=1 \
+    -DGGML_CUDA_DMMV_X=32 \
+	-DGGML_CUDA_MMV_Y=1 \
+	-DGGML_SCHED_MAX_COPIES=4 \
+    -DGGML_USE_HIPBLAS \
+	-DGGML_USE_LLAMAFILE \
+	-DHIP_FAST_MATH \
+	-DNDEBUG \
+    -DK_QUANTS_PER_ITERATION=2 \
+	-D_CRT_SECURE_NO_WARNINGS \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
+	-D_GNU_SOURCE \
+    -Wno-expansion-to-defined \
+	-Wno-invalid-noreturn \
+	-Wno-ignored-attributes \
+    -Wno-pass-failed \
+	-Wno-deprecated-declarations \
+	-Wno-unused-result \
+    -Xclang \
+    --dependent-lib=msvcrt \
+	-I. \
+    $(foreach arch, $(HIP_ARCHS), --offload-arch=$(arch))
+
+# Append the Linux-only HIP offload architectures.
+# BUG FIX: this Makefile sets OS := $(shell uname -s) at the top; UNAME_S is
+# never defined, so the original condition was always false and the Linux
+# archs (gfx906/gfx908/gfx90a variants) were silently never added.
+ifeq ($(OS),Linux)
+    HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch))
+endif
+
+ifeq ($(OS),Darwin)
+    ifeq ($(ARCH),arm64)
+        all: ollama_runner
+    else ifeq ($(ARCH),amd64)
+        all: ollama_runner ollama_runner_avx ollama_runner_avx2
+    endif
+else
+    all: ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm
+endif
+
+%.cuda.$(OBJ_EXT): %.cu
+	$(NVCC) -c $(CUDA_FLAGS) -o $@ $<
+
+%.cuda.$(OBJ_EXT): %.c
+	$(NVCC) -c $(CFLAGS) -o $@ $<
+
+%.cuda.$(OBJ_EXT): %.cpp
+	$(NVCC) -c $(CXXFLAGS) -o $@ $<
+
+# Link all CUDA objects into the ggml_cuda shared library.
+# Use $(NVCC) (declared above) rather than a hard-coded `nvcc` so the
+# compiler can be overridden on the command line, consistent with the
+# %.cuda.$(OBJ_EXT) compile rules.
+ggml_cuda.$(SHARED_EXT): $(CUDA_OBJS)
+	$(NVCC) --shared -lcuda -lcublas -lcudart -lcublasLt $(CUDA_FLAGS) $(CUDA_OBJS) -o $@
+
+%.hip.$(OBJ_EXT): %.cu
+	$(HIPCC) -c $(HIP_FLAGS) -o $@ $<
+
+%.hip.$(OBJ_EXT): %.c
+	$(HIPCC) -c $(CFLAGS) -o $@ $<
+
+%.hip.$(OBJ_EXT): %.cpp
+	$(HIPCC) -c $(CXXFLAGS) -o $@ $<
+
+ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS)
+	$(HIPCC) --shared -lhipblas -lamdhip64 -lrocblas $(HIP_OBJS) -o $@
+
+ollama_runner:
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -o $@ ./runner 
+
+ollama_runner_avx:
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx -o $@ ./runner
+
+ollama_runner_avx2:
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,avx2 -o $@ ./runner
+
+# GPU-enabled runner builds.
+# BUG FIX: the shared-library prerequisites must use $(SHARED_EXT) — the
+# library rules produce ggml_cuda.$(SHARED_EXT)/ggml_hipblas.$(SHARED_EXT)
+# (.so on Linux, .dll on MinGW). Hard-coding .dll made the Linux `all`
+# target fail with "No rule to make target 'ggml_cuda.dll'".
+ollama_runner_cuda: ggml_cuda.$(SHARED_EXT)
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
+
+ollama_runner_rocm: ggml_hipblas.$(SHARED_EXT)
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner
+
+clean:
+	rm -f $(CUDA_OBJS) $(HIP_OBJS) ggml_cuda.$(SHARED_EXT) ggml_cuda.* ggml_hipblas.* ollama_runner*
+
+.PHONY: all clean ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm

+ 3 - 14
llama/README.md

@@ -13,11 +13,6 @@ Supported:
 - [x] Linux ROCm
 - [x] Llava
 
-Extra build steps are required for CUDA and ROCm on Windows since `nvcc` and `hipcc` both require using msvc as the host compiler. For these small dlls are created:
-
-- `ggml-cuda.dll`
-- `ggml-hipblas.dll`
-
 > Note: it's important that memory is allocated and freed by the same compiler (e.g. entirely by code compiled with msvc or mingw). Issues from this should be rare, but there are some places where pointers are returned by the CUDA or HIP runtimes and freed elsewhere, causing a crash. In a future change the same runtime should be used in both cases to avoid crashes.
 
 ## Building
@@ -46,11 +41,7 @@ go build -tags=avx,avx2 .
 
 ### CUDA
 
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build `libggml-cuda.so`:
-
-```shell
-./build_cuda.sh
-```
+Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive).
 
 Then build the package with the `cuda` tag:
 
@@ -69,7 +60,7 @@ Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-down
 Build `ggml-cuda.dll`:
 
 ```shell
-./build_cuda.ps1
+make ggml_cuda.dll
 ```
 
 Then build the package with the `cuda` tag:
@@ -82,10 +73,8 @@ go build -tags=cuda .
 
 Install [ROCm 5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/) and [Strawberry Perl](https://strawberryperl.com/).
 
-Then, build `ggml-hipblas.dll`:
-
 ```shell
-./build_hipblas.sh
+make ggml_hipblas.dll
 ```
 
 Then build the package with the `rocm` tag:

+ 0 - 47
llama/build_cuda.sh

@@ -1,47 +0,0 @@
-#!/bin/bash
-
-os="$(uname -s)"
-
-if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then
-    output="ggml-cuda.dll"
-else
-    output="libggml-cuda.so"
-fi
-
-nvcc \
-    -t $(nproc) \
-    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
-    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
-    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
-    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
-    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
-    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
-    -DGGML_CUDA_DMMV_X=32 \
-    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-    -DGGML_CUDA_MMV_Y=1 \
-    -DGGML_USE_CUDA=1 \
-    -DGGML_SHARED=1 \
-    -DGGML_BUILD=1 \
-    -DGGML_USE_LLAMAFILE \
-    -D_GNU_SOURCE \
-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-    -Wno-deprecated-gpu-targets \
-    --forward-unknown-to-host-compiler \
-    -use_fast_math \
-    -link \
-    -shared \
-    -I. \
-    -lcuda -lcublas -lcudart -lcublasLt \
-    -O3 \
-    -o $output \
-    ggml-cuda.cu \
-    ggml-cuda/*.cu \
-    ggml-cuda/template-instances/fattn-wmma*.cu \
-    ggml-cuda/template-instances/mmq*.cu \
-    ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu \
-    ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu \
-    ggml-cuda/template-instances/fattn-vec*f16-f16.cu \
-    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
-
-#   -DGGML_CUDA_USE_GRAPHS=1 
-#   -DGGML_CUDA_FA_ALL_QUANTS=1

+ 0 - 96
llama/build_hipblas.sh

@@ -1,96 +0,0 @@
-#!/bin/bash
-
-archs=(
-    gfx900
-    gfx940
-    gfx941
-    gfx942
-    gfx1010
-    gfx1012
-    gfx1030
-    gfx1100
-    gfx1101
-    gfx1102
-)
-
-linux_archs=(
-    gfx906:xnack-
-    gfx908:xnack-
-    gfx90a:xnack+
-    gfx90a:xnack-
-)
-
-os="$(uname -s)"
-
-additional_flags=""
-
-if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then
-    output="ggml-hipblas.dll"
-    additional_flags=" -Xclang --dependent-lib=msvcrt"
-else
-    output="libggml-hipblas.so"
-    archs+=("${linux_archs[@]}")
-fi
-
-for arch in "${archs[@]}"; do
-    additional_flags+=" --offload-arch=$arch"
-done
-
-# Create an array of all source files, expanding globs
-sources=(
-    $(echo ggml-cuda/template-instances/fattn-wmma*.cu)
-    $(echo ggml-cuda/template-instances/mmq*.cu)
-    $(echo ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)
-    $(echo ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)
-    $(echo ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
-    ggml-cuda.cu
-    $(echo ggml-cuda/*.cu)
-    ggml.c
-    ggml-backend.c
-    ggml-alloc.c
-    ggml-quants.c
-    sgemm.cpp
-)
-
-# Function to compile a single source file
-compile_source() {
-    src="$1"
-    hipcc -c -O3 -DGGML_USE_CUDA -DGGML_BUILD=1 -DGGML_SHARED=1 -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 \
-          -DGGML_SCHED_MAX_COPIES=4 -DGGML_USE_HIPBLAS -DGGML_USE_LLAMAFILE -DHIP_FAST_MATH -DNDEBUG \
-          -DK_QUANTS_PER_ITERATION=2 -D_CRT_SECURE_NO_WARNINGS -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-          -D_GNU_SOURCE -Wno-expansion-to-defined -Wno-invalid-noreturn -Wno-ignored-attributes -Wno-pass-failed \
-          -Wno-deprecated-declarations -Wno-unused-result -I. \
-          $additional_flags -o "${src%.cu}.o" "$src"
-}
-
-# Function to handle Ctrl+C
-cleanup() {
-    echo "Terminating all background processes..."
-    kill 0
-}
-
-# Set trap to handle SIGINT (Ctrl+C)
-trap cleanup SIGINT
-
-# Limit the number of concurrent jobs
-max_jobs=$(nproc)
-job_count=0
-
-for src in "${sources[@]}"; do
-    echo "$src"
-    compile_source "$src" &
-    job_count=$((job_count + 1))
-    if [[ $job_count -ge $max_jobs ]]; then
-        wait -n
-        job_count=$((job_count - 1))
-    fi
-done
-
-wait
-
-# Link all object files into a shared library
-echo "Linking object files..."
-hipcc -v -shared -o $output *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o -lhipblas -lamdhip64 -lrocblas
-
-# Clean up object files after linking
-rm -f *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o

+ 2 - 2
llama/llama.go

@@ -23,8 +23,8 @@ package llama
 // #cgo rocm CFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_hipblas -lhipblas -lamdhip64 -lrocblas
-// #cgo windows,cuda LDFLAGS: -L. -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3/lib/x64" -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt
-// #cgo windows,rocm LDFLAGS: -L. -L"C:/Program Files/AMD/ROCm/5.7/lib"
+// #cgo windows,cuda LDFLAGS: -L${SRCDIR} -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3/lib/x64" -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt
+// #cgo windows,rocm LDFLAGS: -L${SRCDIR} -L"C:/Program Files/AMD/ROCm/5.7/lib" -lggml_hipblas -lhipblas -lamdhip64 -lrocblas
 // #cgo linux,cuda LDFLAGS: -L${SRCDIR} -L/usr/local/cuda/lib64 -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt -lpthread -ldl -lrt
 // #cgo linux,rocm LDFLAGS: -L/opt/rocm/lib
 // #include <stdlib.h>