소스 검색

remove build scripts

jmorganca 11 달 전
부모
커밋
e9d15eb277
5개의 변경된 파일에 155개의 추가작업 그리고 305개의 삭제
  1. 150 146
      llama/Makefile
  2. 3 14
      llama/README.md
  3. 0 47
      llama/build_cuda.sh
  4. 0 96
      llama/build_hipblas.sh
  5. 2 2
      llama/llama.go

+ 150 - 146
llama/Makefile

@@ -1,146 +1,150 @@
-OS := $(shell uname -s)
-ARCH := $(or $(ARCH), $(shell uname -m))
-NVCC := nvcc
-HIP_PATH := $(shell cygpath -w -s "$(HIP_PATH)")
-HIPCC := "$(HIP_PATH)/bin/hipcc.bin.exe"
-
-ifeq ($(ARCH),x86_64)
-    ARCH := amd64
-endif
-
-# Determine object file extension based on OS
-ifneq (,$(findstring MINGW,$(OS)))
-    OBJ_EXT := obj
-	SHARED_EXT := dll
-else
-    OBJ_EXT := o
-	SHARED_EXT := so
-endif
-
-CUDA_SRCS := \
-    ggml-cuda.cu \
-    $(wildcard ggml-cuda/*.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \
-    $(wildcard ggml-cuda/template-instances/mmq*.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) \
-    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
-
-CUDA_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
-CUDA_OBJS := $(CUDA_OBJS:.c=.cuda.$(OBJ_EXT))
-CUDA_OBJS := $(CUDA_OBJS:.cpp=.cuda.$(OBJ_EXT))
-
-HIP_OBJS := $(CUDA_SRCS:.cu=.hip.$(OBJ_EXT))
-HIP_OBJS := $(HIP_OBJS:.c=.hip.$(OBJ_EXT))
-HIP_OBJS := $(HIP_OBJS:.cpp=.hip.$(OBJ_EXT))
-
-CUDA_FLAGS := \
-    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
-    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
-    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
-    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
-    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
-    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
-    -DGGML_CUDA_DMMV_X=32 \
-    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-    -DGGML_USE_CUDA=1 \
-    -DGGML_SHARED=1 \
-    -DGGML_BUILD=1 \
-    -DGGML_USE_LLAMAFILE \
-    -D_GNU_SOURCE \
-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-    -Wno-deprecated-gpu-targets \
-    --forward-unknown-to-host-compiler \
-    -use_fast_math \
-    -link \
-    -shared \
-    -I. \
-    -O3
-
-HIP_ARCHS := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
-LINUX_HIP_ARCHS := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
-
-HIP_FLAGS := \
-    -c \
-	-O3 \
-	-DGGML_USE_CUDA \
-	-DGGML_BUILD=1 \
-	-DGGML_SHARED=1 \
-    -DGGML_CUDA_DMMV_X=32 \
-	-DGGML_CUDA_MMV_Y=1 \
-	-DGGML_SCHED_MAX_COPIES=4 \
-    -DGGML_USE_HIPBLAS \
-	-DGGML_USE_LLAMAFILE \
-	-DHIP_FAST_MATH \
-	-DNDEBUG \
-    -DK_QUANTS_PER_ITERATION=2 \
-	-D_CRT_SECURE_NO_WARNINGS \
-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-	-D_GNU_SOURCE \
-    -Wno-expansion-to-defined \
-	-Wno-invalid-noreturn \
-	-Wno-ignored-attributes \
-    -Wno-pass-failed \
-	-Wno-deprecated-declarations \
-	-Wno-unused-result \
-	-I. \
-    $(foreach arch, $(HIP_ARCHS), --offload-arch=$(arch))
-
-ifeq ($(UNAME_S), Linux)
-    HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch))
-endif
-
-ifeq ($(OS),Darwin)
-    ifeq ($(ARCH),arm64)
-        all: ollama_runner
-    else ifeq ($(ARCH),amd64)
-        all: ollama_runner ollama_runner_avx ollama_runner_avx2
-    endif
-else
-    all: ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm
-endif
-
-%.cuda.$(OBJ_EXT): %.cu
-	$(NVCC) -c $(CUDA_FLAGS) -o $@ $<
-
-%.cuda.$(OBJ_EXT): %.c
-	$(NVCC) -c $(CFLAGS) -o $@ $<
-
-%.cuda.$(OBJ_EXT): %.cpp
-	$(NVCC) -c $(CXXFLAGS) -o $@ $<
-
-ggml_cuda.$(SHARED_EXT): $(CUDA_OBJS)
-	nvcc --shared -lcuda -lcublas -lcudart -lcublasLt $(CUDA_FLAGS) $(CUDA_OBJS) -o $@
-
-%.hip.$(OBJ_EXT): %.cu
-	$(HIPCC) -c $(HIP_FLAGS) -o $@ $<
-
-%.hip.$(OBJ_EXT): %.c
-	$(HIPCC) -c $(CFLAGS) -o $@ $<
-
-%.hip.$(OBJ_EXT): %.cpp
-	$(HIPCC) -c $(CXXFLAGS) -o $@ $<
-
-ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS)
-	$(HIPCC) --shared -lhipblas -lamdhip64 -lrocblas $(HIP_OBJS) -o $@
-
-ollama_runner:
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -o $@ ./runner 
-
-ollama_runner_avx:
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx -o $@ ./runner
-
-ollama_runner_avx2:
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,avx2 -o $@ ./runner
-
-ollama_runner_cuda: ggml_cuda.dll
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
-
-ollama_runner_rocm: ggml_hipblas.dll
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner
-
-clean:
-	rm -f $(CUDA_OBJS) $(HIP_OBJS) ggml_cuda.$(SHARED_EXT) ggml_cuda.* ggml_hipblas.* ollama_runner*
-
-.PHONY: all clean ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm
+OS := $(shell uname -s)
+ARCH := $(or $(ARCH), $(shell uname -m))
+NVCC := nvcc
+
+export CGO_CFLAGS_ALLOW = -mfma|-mf16c
+export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
+
+ifeq ($(ARCH),x86_64)
+    ARCH := amd64
+endif
+
+ifneq (,$(findstring MINGW,$(OS)))
+    OBJ_EXT := obj
+	SHARED_EXT := dll
+    HIP_PATH := $(shell cygpath -w -s "$(HIP_PATH)")
+else
+    OBJ_EXT := o
+	SHARED_EXT := so
+endif
+
+CUDA_SRCS := \
+    ggml-cuda.cu \
+    $(wildcard ggml-cuda/*.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \
+    $(wildcard ggml-cuda/template-instances/mmq*.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) \
+    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
+
+CUDA_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
+CUDA_OBJS := $(CUDA_OBJS:.c=.cuda.$(OBJ_EXT))
+CUDA_OBJS := $(CUDA_OBJS:.cpp=.cuda.$(OBJ_EXT))
+
+HIP_OBJS := $(CUDA_SRCS:.cu=.hip.$(OBJ_EXT))
+HIP_OBJS := $(HIP_OBJS:.c=.hip.$(OBJ_EXT))
+HIP_OBJS := $(HIP_OBJS:.cpp=.hip.$(OBJ_EXT))
+
+CUDA_FLAGS := \
+    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
+    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
+    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
+    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
+    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
+    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
+    -DGGML_CUDA_DMMV_X=32 \
+    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
+    -DGGML_USE_CUDA=1 \
+    -DGGML_SHARED=1 \
+    -DGGML_BUILD=1 \
+    -DGGML_USE_LLAMAFILE \
+    -D_GNU_SOURCE \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
+    -Wno-deprecated-gpu-targets \
+    --forward-unknown-to-host-compiler \
+    -use_fast_math \
+    -link \
+    -shared \
+    -I. \
+    -O3
+
+HIP_ARCHS := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
+LINUX_HIP_ARCHS := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
+
+HIPCC := "$(HIP_PATH)/bin/hipcc.bin.exe"
+HIP_FLAGS := \
+    -c \
+	-O3 \
+	-DGGML_USE_CUDA \
+	-DGGML_BUILD=1 \
+	-DGGML_SHARED=1 \
+    -DGGML_CUDA_DMMV_X=32 \
+	-DGGML_CUDA_MMV_Y=1 \
+	-DGGML_SCHED_MAX_COPIES=4 \
+    -DGGML_USE_HIPBLAS \
+	-DGGML_USE_LLAMAFILE \
+	-DHIP_FAST_MATH \
+	-DNDEBUG \
+    -DK_QUANTS_PER_ITERATION=2 \
+	-D_CRT_SECURE_NO_WARNINGS \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
+	-D_GNU_SOURCE \
+    -Wno-expansion-to-defined \
+	-Wno-invalid-noreturn \
+	-Wno-ignored-attributes \
+    -Wno-pass-failed \
+	-Wno-deprecated-declarations \
+	-Wno-unused-result \
+    -Xclang \
+    --dependent-lib=msvcrt \
+	-I. \
+    $(foreach arch, $(HIP_ARCHS), --offload-arch=$(arch))
+
+ifeq ($(UNAME_S), Linux)
+    HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch))
+endif
+
+ifeq ($(OS),Darwin)
+    ifeq ($(ARCH),arm64)
+        all: ollama_runner
+    else ifeq ($(ARCH),amd64)
+        all: ollama_runner ollama_runner_avx ollama_runner_avx2
+    endif
+else
+    all: ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm
+endif
+
+%.cuda.$(OBJ_EXT): %.cu
+	$(NVCC) -c $(CUDA_FLAGS) -o $@ $<
+
+%.cuda.$(OBJ_EXT): %.c
+	$(NVCC) -c $(CFLAGS) -o $@ $<
+
+%.cuda.$(OBJ_EXT): %.cpp
+	$(NVCC) -c $(CXXFLAGS) -o $@ $<
+
+ggml_cuda.$(SHARED_EXT): $(CUDA_OBJS)
+	nvcc --shared -lcuda -lcublas -lcudart -lcublasLt $(CUDA_FLAGS) $(CUDA_OBJS) -o $@
+
+%.hip.$(OBJ_EXT): %.cu
+	$(HIPCC) -c $(HIP_FLAGS) -o $@ $<
+
+%.hip.$(OBJ_EXT): %.c
+	$(HIPCC) -c $(CFLAGS) -o $@ $<
+
+%.hip.$(OBJ_EXT): %.cpp
+	$(HIPCC) -c $(CXXFLAGS) -o $@ $<
+
+ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS)
+	$(HIPCC) --shared -lhipblas -lamdhip64 -lrocblas $(HIP_OBJS) -o $@
+
+ollama_runner:
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -o $@ ./runner 
+
+ollama_runner_avx:
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx -o $@ ./runner
+
+ollama_runner_avx2:
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,avx2 -o $@ ./runner
+
+ollama_runner_cuda: ggml_cuda.dll
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
+
+ollama_runner_rocm: ggml_hipblas.dll
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner
+
+clean:
+	rm -f $(CUDA_OBJS) $(HIP_OBJS) ggml_cuda.$(SHARED_EXT) ggml_cuda.* ggml_hipblas.* ollama_runner*
+
+.PHONY: all clean ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm

+ 3 - 14
llama/README.md

@@ -13,11 +13,6 @@ Supported:
 - [x] Linux ROCm
 - [x] Linux ROCm
 - [x] Llava
 - [x] Llava
 
 
-Extra build steps are required for CUDA and ROCm on Windows since `nvcc` and `hipcc` both require using msvc as the host compiler. For these small dlls are created:
-
-- `ggml-cuda.dll`
-- `ggml-hipblas.dll`
-
 > Note: it's important that memory is allocated and freed by the same compiler (e.g. entirely by code compiled with msvc or mingw). Issues from this should be rare, but there are some places where pointers are returned by the CUDA or HIP runtimes and freed elsewhere, causing a crash. In a future change the same runtime should be used in both cases to avoid crashes.
 > Note: it's important that memory is allocated and freed by the same compiler (e.g. entirely by code compiled with msvc or mingw). Issues from this should be rare, but there are some places where pointers are returned by the CUDA or HIP runtimes and freed elsewhere, causing a crash. In a future change the same runtime should be used in both cases to avoid crashes.
 
 
 ## Building
 ## Building
@@ -46,11 +41,7 @@ go build -tags=avx,avx2 .
 
 
 ### CUDA
 ### CUDA
 
 
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build `libggml-cuda.so`:
-
-```shell
-./build_cuda.sh
-```
+Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive):
 
 
 Then build the package with the `cuda` tag:
 Then build the package with the `cuda` tag:
 
 
@@ -69,7 +60,7 @@ Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-down
 Build `ggml-cuda.dll`:
 Build `ggml-cuda.dll`:
 
 
 ```shell
 ```shell
-./build_cuda.ps1
+make ggml_cuda.dll
 ```
 ```
 
 
 Then build the package with the `cuda` tag:
 Then build the package with the `cuda` tag:
@@ -82,10 +73,8 @@ go build -tags=cuda .
 
 
 Install [ROCm 5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/) and [Strawberry Perl](https://strawberryperl.com/).
 Install [ROCm 5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/) and [Strawberry Perl](https://strawberryperl.com/).
 
 
-Then, build `ggml-hipblas.dll`:
-
 ```shell
 ```shell
-./build_hipblas.sh
+make ggml_hipblas.dll
 ```
 ```
 
 
 Then build the package with the `rocm` tag:
 Then build the package with the `rocm` tag:

+ 0 - 47
llama/build_cuda.sh

@@ -1,47 +0,0 @@
-#!/bin/bash
-
-os="$(uname -s)"
-
-if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then
-    output="ggml-cuda.dll"
-else
-    output="libggml-cuda.so"
-fi
-
-nvcc \
-    -t $(nproc) \
-    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
-    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
-    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
-    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
-    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
-    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
-    -DGGML_CUDA_DMMV_X=32 \
-    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-    -DGGML_CUDA_MMV_Y=1 \
-    -DGGML_USE_CUDA=1 \
-    -DGGML_SHARED=1 \
-    -DGGML_BUILD=1 \
-    -DGGML_USE_LLAMAFILE \
-    -D_GNU_SOURCE \
-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-    -Wno-deprecated-gpu-targets \
-    --forward-unknown-to-host-compiler \
-    -use_fast_math \
-    -link \
-    -shared \
-    -I. \
-    -lcuda -lcublas -lcudart -lcublasLt \
-    -O3 \
-    -o $output \
-    ggml-cuda.cu \
-    ggml-cuda/*.cu \
-    ggml-cuda/template-instances/fattn-wmma*.cu \
-    ggml-cuda/template-instances/mmq*.cu \
-    ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu \
-    ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu \
-    ggml-cuda/template-instances/fattn-vec*f16-f16.cu \
-    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
-
-#   -DGGML_CUDA_USE_GRAPHS=1 
-#   -DGGML_CUDA_FA_ALL_QUANTS=1

+ 0 - 96
llama/build_hipblas.sh

@@ -1,96 +0,0 @@
-#!/bin/bash
-
-archs=(
-    gfx900
-    gfx940
-    gfx941
-    gfx942
-    gfx1010
-    gfx1012
-    gfx1030
-    gfx1100
-    gfx1101
-    gfx1102
-)
-
-linux_archs=(
-    gfx906:xnack-
-    gfx908:xnack-
-    gfx90a:xnack+
-    gfx90a:xnack-
-)
-
-os="$(uname -s)"
-
-additional_flags=""
-
-if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then
-    output="ggml-hipblas.dll"
-    additional_flags=" -Xclang --dependent-lib=msvcrt"
-else
-    output="libggml-hipblas.so"
-    archs+=("${linux_archs[@]}")
-fi
-
-for arch in "${archs[@]}"; do
-    additional_flags+=" --offload-arch=$arch"
-done
-
-# Create an array of all source files, expanding globs
-sources=(
-    $(echo ggml-cuda/template-instances/fattn-wmma*.cu)
-    $(echo ggml-cuda/template-instances/mmq*.cu)
-    $(echo ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)
-    $(echo ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)
-    $(echo ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
-    ggml-cuda.cu
-    $(echo ggml-cuda/*.cu)
-    ggml.c
-    ggml-backend.c
-    ggml-alloc.c
-    ggml-quants.c
-    sgemm.cpp
-)
-
-# Function to compile a single source file
-compile_source() {
-    src="$1"
-    hipcc -c -O3 -DGGML_USE_CUDA -DGGML_BUILD=1 -DGGML_SHARED=1 -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 \
-          -DGGML_SCHED_MAX_COPIES=4 -DGGML_USE_HIPBLAS -DGGML_USE_LLAMAFILE -DHIP_FAST_MATH -DNDEBUG \
-          -DK_QUANTS_PER_ITERATION=2 -D_CRT_SECURE_NO_WARNINGS -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-          -D_GNU_SOURCE -Wno-expansion-to-defined -Wno-invalid-noreturn -Wno-ignored-attributes -Wno-pass-failed \
-          -Wno-deprecated-declarations -Wno-unused-result -I. \
-          $additional_flags -o "${src%.cu}.o" "$src"
-}
-
-# Function to handle Ctrl+C
-cleanup() {
-    echo "Terminating all background processes..."
-    kill 0
-}
-
-# Set trap to handle SIGINT (Ctrl+C)
-trap cleanup SIGINT
-
-# Limit the number of concurrent jobs
-max_jobs=$(nproc)
-job_count=0
-
-for src in "${sources[@]}"; do
-    echo "$src"
-    compile_source "$src" &
-    job_count=$((job_count + 1))
-    if [[ $job_count -ge $max_jobs ]]; then
-        wait -n
-        job_count=$((job_count - 1))
-    fi
-done
-
-wait
-
-# Link all object files into a shared library
-echo "Linking object files..."
-hipcc -v -shared -o $output *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o -lhipblas -lamdhip64 -lrocblas
-
-# Clean up object files after linking
-rm -f *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o

+ 2 - 2
llama/llama.go

@@ -23,8 +23,8 @@ package llama
 // #cgo rocm CFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm CFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_hipblas -lhipblas -lamdhip64 -lrocblas
 // #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_hipblas -lhipblas -lamdhip64 -lrocblas
-// #cgo windows,cuda LDFLAGS: -L. -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3/lib/x64" -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt
-// #cgo windows,rocm LDFLAGS: -L. -L"C:/Program Files/AMD/ROCm/5.7/lib"
+// #cgo windows,cuda LDFLAGS: -L${SRCDIR} -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3/lib/x64" -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt
+// #cgo windows,rocm LDFLAGS: -L${SRCDIR} -L"C:/Program Files/AMD/ROCm/5.7/lib" -lggml_hipblas -lhipblas -lamdhip64 -lrocblas
 // #cgo linux,cuda LDFLAGS: -L${SRCDIR} -L/usr/local/cuda/lib64 -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt -lpthread -ldl -lrt
 // #cgo linux,cuda LDFLAGS: -L${SRCDIR} -L/usr/local/cuda/lib64 -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt -lpthread -ldl -lrt
 // #cgo linux,rocm LDFLAGS: -L/opt/rocm/lib
 // #cgo linux,rocm LDFLAGS: -L/opt/rocm/lib
 // #include <stdlib.h>
 // #include <stdlib.h>