소스 검색

remove build scripts

jmorganca 11 달 전
부모
커밋
e9d15eb277
5개의 변경된 파일에 155개의 추가작업 그리고 305개의 삭제
  1. 150 146
      llama/Makefile
  2. 3 14
      llama/README.md
  3. 0 47
      llama/build_cuda.sh
  4. 0 96
      llama/build_hipblas.sh
  5. 2 2
      llama/llama.go

+ 150 - 146
llama/Makefile

@@ -1,146 +1,150 @@
-OS := $(shell uname -s)
-ARCH := $(or $(ARCH), $(shell uname -m))
-NVCC := nvcc
-HIP_PATH := $(shell cygpath -w -s "$(HIP_PATH)")
-HIPCC := "$(HIP_PATH)/bin/hipcc.bin.exe"
-
-ifeq ($(ARCH),x86_64)
-    ARCH := amd64
-endif
-
-# Determine object file extension based on OS
-ifneq (,$(findstring MINGW,$(OS)))
-    OBJ_EXT := obj
-	SHARED_EXT := dll
-else
-    OBJ_EXT := o
-	SHARED_EXT := so
-endif
-
-CUDA_SRCS := \
-    ggml-cuda.cu \
-    $(wildcard ggml-cuda/*.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \
-    $(wildcard ggml-cuda/template-instances/mmq*.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
-    $(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) \
-    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
-
-CUDA_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
-CUDA_OBJS := $(CUDA_OBJS:.c=.cuda.$(OBJ_EXT))
-CUDA_OBJS := $(CUDA_OBJS:.cpp=.cuda.$(OBJ_EXT))
-
-HIP_OBJS := $(CUDA_SRCS:.cu=.hip.$(OBJ_EXT))
-HIP_OBJS := $(HIP_OBJS:.c=.hip.$(OBJ_EXT))
-HIP_OBJS := $(HIP_OBJS:.cpp=.hip.$(OBJ_EXT))
-
-CUDA_FLAGS := \
-    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
-    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
-    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
-    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
-    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
-    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
-    -DGGML_CUDA_DMMV_X=32 \
-    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-    -DGGML_USE_CUDA=1 \
-    -DGGML_SHARED=1 \
-    -DGGML_BUILD=1 \
-    -DGGML_USE_LLAMAFILE \
-    -D_GNU_SOURCE \
-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-    -Wno-deprecated-gpu-targets \
-    --forward-unknown-to-host-compiler \
-    -use_fast_math \
-    -link \
-    -shared \
-    -I. \
-    -O3
-
-HIP_ARCHS := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
-LINUX_HIP_ARCHS := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
-
-HIP_FLAGS := \
-    -c \
-	-O3 \
-	-DGGML_USE_CUDA \
-	-DGGML_BUILD=1 \
-	-DGGML_SHARED=1 \
-    -DGGML_CUDA_DMMV_X=32 \
-	-DGGML_CUDA_MMV_Y=1 \
-	-DGGML_SCHED_MAX_COPIES=4 \
-    -DGGML_USE_HIPBLAS \
-	-DGGML_USE_LLAMAFILE \
-	-DHIP_FAST_MATH \
-	-DNDEBUG \
-    -DK_QUANTS_PER_ITERATION=2 \
-	-D_CRT_SECURE_NO_WARNINGS \
-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-	-D_GNU_SOURCE \
-    -Wno-expansion-to-defined \
-	-Wno-invalid-noreturn \
-	-Wno-ignored-attributes \
-    -Wno-pass-failed \
-	-Wno-deprecated-declarations \
-	-Wno-unused-result \
-	-I. \
-    $(foreach arch, $(HIP_ARCHS), --offload-arch=$(arch))
-
-ifeq ($(UNAME_S), Linux)
-    HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch))
-endif
-
-ifeq ($(OS),Darwin)
-    ifeq ($(ARCH),arm64)
-        all: ollama_runner
-    else ifeq ($(ARCH),amd64)
-        all: ollama_runner ollama_runner_avx ollama_runner_avx2
-    endif
-else
-    all: ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm
-endif
-
-%.cuda.$(OBJ_EXT): %.cu
-	$(NVCC) -c $(CUDA_FLAGS) -o $@ $<
-
-%.cuda.$(OBJ_EXT): %.c
-	$(NVCC) -c $(CFLAGS) -o $@ $<
-
-%.cuda.$(OBJ_EXT): %.cpp
-	$(NVCC) -c $(CXXFLAGS) -o $@ $<
-
-ggml_cuda.$(SHARED_EXT): $(CUDA_OBJS)
-	nvcc --shared -lcuda -lcublas -lcudart -lcublasLt $(CUDA_FLAGS) $(CUDA_OBJS) -o $@
-
-%.hip.$(OBJ_EXT): %.cu
-	$(HIPCC) -c $(HIP_FLAGS) -o $@ $<
-
-%.hip.$(OBJ_EXT): %.c
-	$(HIPCC) -c $(CFLAGS) -o $@ $<
-
-%.hip.$(OBJ_EXT): %.cpp
-	$(HIPCC) -c $(CXXFLAGS) -o $@ $<
-
-ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS)
-	$(HIPCC) --shared -lhipblas -lamdhip64 -lrocblas $(HIP_OBJS) -o $@
-
-ollama_runner:
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -o $@ ./runner 
-
-ollama_runner_avx:
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx -o $@ ./runner
-
-ollama_runner_avx2:
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,avx2 -o $@ ./runner
-
-ollama_runner_cuda: ggml_cuda.dll
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
-
-ollama_runner_rocm: ggml_hipblas.dll
-	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner
-
-clean:
-	rm -f $(CUDA_OBJS) $(HIP_OBJS) ggml_cuda.$(SHARED_EXT) ggml_cuda.* ggml_hipblas.* ollama_runner*
-
-.PHONY: all clean ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm
+OS := $(shell uname -s)
+ARCH := $(or $(ARCH), $(shell uname -m))
+NVCC := nvcc
+
+export CGO_CFLAGS_ALLOW = -mfma|-mf16c
+export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
+
+ifeq ($(ARCH),x86_64)
+    ARCH := amd64
+endif
+
+ifneq (,$(findstring MINGW,$(OS)))
+    OBJ_EXT := obj
+	SHARED_EXT := dll
+    HIP_PATH := $(shell cygpath -w -s "$(HIP_PATH)")
+else
+    OBJ_EXT := o
+	SHARED_EXT := so
+endif
+
+CUDA_SRCS := \
+    ggml-cuda.cu \
+    $(wildcard ggml-cuda/*.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \
+    $(wildcard ggml-cuda/template-instances/mmq*.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
+    $(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) \
+    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
+
+CUDA_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
+CUDA_OBJS := $(CUDA_OBJS:.c=.cuda.$(OBJ_EXT))
+CUDA_OBJS := $(CUDA_OBJS:.cpp=.cuda.$(OBJ_EXT))
+
+HIP_OBJS := $(CUDA_SRCS:.cu=.hip.$(OBJ_EXT))
+HIP_OBJS := $(HIP_OBJS:.c=.hip.$(OBJ_EXT))
+HIP_OBJS := $(HIP_OBJS:.cpp=.hip.$(OBJ_EXT))
+
+CUDA_FLAGS := \
+    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
+    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
+    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
+    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
+    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
+    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
+    -DGGML_CUDA_DMMV_X=32 \
+    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
+    -DGGML_USE_CUDA=1 \
+    -DGGML_SHARED=1 \
+    -DGGML_BUILD=1 \
+    -DGGML_USE_LLAMAFILE \
+    -D_GNU_SOURCE \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
+    -Wno-deprecated-gpu-targets \
+    --forward-unknown-to-host-compiler \
+    -use_fast_math \
+    -link \
+    -shared \
+    -I. \
+    -O3
+
+HIP_ARCHS := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
+LINUX_HIP_ARCHS := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
+
+HIPCC := "$(HIP_PATH)/bin/hipcc.bin.exe"
+HIP_FLAGS := \
+    -c \
+	-O3 \
+	-DGGML_USE_CUDA \
+	-DGGML_BUILD=1 \
+	-DGGML_SHARED=1 \
+    -DGGML_CUDA_DMMV_X=32 \
+	-DGGML_CUDA_MMV_Y=1 \
+	-DGGML_SCHED_MAX_COPIES=4 \
+    -DGGML_USE_HIPBLAS \
+	-DGGML_USE_LLAMAFILE \
+	-DHIP_FAST_MATH \
+	-DNDEBUG \
+    -DK_QUANTS_PER_ITERATION=2 \
+	-D_CRT_SECURE_NO_WARNINGS \
+    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
+	-D_GNU_SOURCE \
+    -Wno-expansion-to-defined \
+	-Wno-invalid-noreturn \
+	-Wno-ignored-attributes \
+    -Wno-pass-failed \
+	-Wno-deprecated-declarations \
+	-Wno-unused-result \
+    -Xclang \
+    --dependent-lib=msvcrt \
+	-I. \
+    $(foreach arch, $(HIP_ARCHS), --offload-arch=$(arch))
+
+ifeq ($(UNAME_S), Linux)
+    HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch))
+endif
+
+ifeq ($(OS),Darwin)
+    ifeq ($(ARCH),arm64)
+        all: ollama_runner
+    else ifeq ($(ARCH),amd64)
+        all: ollama_runner ollama_runner_avx ollama_runner_avx2
+    endif
+else
+    all: ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm
+endif
+
+%.cuda.$(OBJ_EXT): %.cu
+	$(NVCC) -c $(CUDA_FLAGS) -o $@ $<
+
+%.cuda.$(OBJ_EXT): %.c
+	$(NVCC) -c $(CFLAGS) -o $@ $<
+
+%.cuda.$(OBJ_EXT): %.cpp
+	$(NVCC) -c $(CXXFLAGS) -o $@ $<
+
+ggml_cuda.$(SHARED_EXT): $(CUDA_OBJS)
+	nvcc --shared -lcuda -lcublas -lcudart -lcublasLt $(CUDA_FLAGS) $(CUDA_OBJS) -o $@
+
+%.hip.$(OBJ_EXT): %.cu
+	$(HIPCC) -c $(HIP_FLAGS) -o $@ $<
+
+%.hip.$(OBJ_EXT): %.c
+	$(HIPCC) -c $(CFLAGS) -o $@ $<
+
+%.hip.$(OBJ_EXT): %.cpp
+	$(HIPCC) -c $(CXXFLAGS) -o $@ $<
+
+ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS)
+	$(HIPCC) --shared -lhipblas -lamdhip64 -lrocblas $(HIP_OBJS) -o $@
+
+ollama_runner:
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -o $@ ./runner 
+
+ollama_runner_avx:
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx -o $@ ./runner
+
+ollama_runner_avx2:
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,avx2 -o $@ ./runner
+
+ollama_runner_cuda: ggml_cuda.dll
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
+
+ollama_runner_rocm: ggml_hipblas.dll
+	CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner
+
+clean:
+	rm -f $(CUDA_OBJS) $(HIP_OBJS) ggml_cuda.$(SHARED_EXT) ggml_cuda.* ggml_hipblas.* ollama_runner*
+
+.PHONY: all clean ollama_runner ollama_runner_avx ollama_runner_avx2 ollama_runner_cuda ollama_runner_rocm

+ 3 - 14
llama/README.md

@@ -13,11 +13,6 @@ Supported:
 - [x] Linux ROCm
 - [x] Linux ROCm
 - [x] Llava
 - [x] Llava
 
 
-Extra build steps are required for CUDA and ROCm on Windows since `nvcc` and `hipcc` both require using msvc as the host compiler. For these small dlls are created:
-
-- `ggml-cuda.dll`
-- `ggml-hipblas.dll`
-
 > Note: it's important that memory is allocated and freed by the same compiler (e.g. entirely by code compiled with msvc or mingw). Issues from this should be rare, but there are some places where pointers are returned by the CUDA or HIP runtimes and freed elsewhere, causing a crash. In a future change the same runtime should be used in both cases to avoid crashes.
 > Note: it's important that memory is allocated and freed by the same compiler (e.g. entirely by code compiled with msvc or mingw). Issues from this should be rare, but there are some places where pointers are returned by the CUDA or HIP runtimes and freed elsewhere, causing a crash. In a future change the same runtime should be used in both cases to avoid crashes.
 
 
 ## Building
 ## Building
@@ -46,11 +41,7 @@ go build -tags=avx,avx2 .
 
 
 ### CUDA
 ### CUDA
 
 
-Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive) then build `libggml-cuda.so`:
-
-```shell
-./build_cuda.sh
-```
+Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-download-archive):
 
 
 Then build the package with the `cuda` tag:
 Then build the package with the `cuda` tag:
 
 
@@ -69,7 +60,7 @@ Install the [CUDA toolkit v11.3.1](https://developer.nvidia.com/cuda-11-3-1-down
 Build `ggml-cuda.dll`:
 Build `ggml-cuda.dll`:
 
 
 ```shell
 ```shell
-./build_cuda.ps1
+make ggml_cuda.dll
 ```
 ```
 
 
 Then build the package with the `cuda` tag:
 Then build the package with the `cuda` tag:
@@ -82,10 +73,8 @@ go build -tags=cuda .
 
 
 Install [ROCm 5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/) and [Strawberry Perl](https://strawberryperl.com/).
 Install [ROCm 5.7.1](https://rocm.docs.amd.com/en/docs-5.7.1/) and [Strawberry Perl](https://strawberryperl.com/).
 
 
-Then, build `ggml-hipblas.dll`:
-
 ```shell
 ```shell
-./build_hipblas.sh
+make ggml_hipblas.dll
 ```
 ```
 
 
 Then build the package with the `rocm` tag:
 Then build the package with the `rocm` tag:

+ 0 - 47
llama/build_cuda.sh

@@ -1,47 +0,0 @@
-#!/bin/bash
-
-os="$(uname -s)"
-
-if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then
-    output="ggml-cuda.dll"
-else
-    output="libggml-cuda.so"
-fi
-
-nvcc \
-    -t $(nproc) \
-    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
-    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
-    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
-    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
-    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
-    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
-    -DGGML_CUDA_DMMV_X=32 \
-    -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-    -DGGML_CUDA_MMV_Y=1 \
-    -DGGML_USE_CUDA=1 \
-    -DGGML_SHARED=1 \
-    -DGGML_BUILD=1 \
-    -DGGML_USE_LLAMAFILE \
-    -D_GNU_SOURCE \
-    -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-    -Wno-deprecated-gpu-targets \
-    --forward-unknown-to-host-compiler \
-    -use_fast_math \
-    -link \
-    -shared \
-    -I. \
-    -lcuda -lcublas -lcudart -lcublasLt \
-    -O3 \
-    -o $output \
-    ggml-cuda.cu \
-    ggml-cuda/*.cu \
-    ggml-cuda/template-instances/fattn-wmma*.cu \
-    ggml-cuda/template-instances/mmq*.cu \
-    ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu \
-    ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu \
-    ggml-cuda/template-instances/fattn-vec*f16-f16.cu \
-    ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp
-
-#   -DGGML_CUDA_USE_GRAPHS=1 
-#   -DGGML_CUDA_FA_ALL_QUANTS=1

+ 0 - 96
llama/build_hipblas.sh

@@ -1,96 +0,0 @@
-#!/bin/bash
-
-archs=(
-    gfx900
-    gfx940
-    gfx941
-    gfx942
-    gfx1010
-    gfx1012
-    gfx1030
-    gfx1100
-    gfx1101
-    gfx1102
-)
-
-linux_archs=(
-    gfx906:xnack-
-    gfx908:xnack-
-    gfx90a:xnack+
-    gfx90a:xnack-
-)
-
-os="$(uname -s)"
-
-additional_flags=""
-
-if [[ "$os" == "Windows_NT" || "$os" == "MINGW64_NT"* ]]; then
-    output="ggml-hipblas.dll"
-    additional_flags=" -Xclang --dependent-lib=msvcrt"
-else
-    output="libggml-hipblas.so"
-    archs+=("${linux_archs[@]}")
-fi
-
-for arch in "${archs[@]}"; do
-    additional_flags+=" --offload-arch=$arch"
-done
-
-# Create an array of all source files, expanding globs
-sources=(
-    $(echo ggml-cuda/template-instances/fattn-wmma*.cu)
-    $(echo ggml-cuda/template-instances/mmq*.cu)
-    $(echo ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu)
-    $(echo ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu)
-    $(echo ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
-    ggml-cuda.cu
-    $(echo ggml-cuda/*.cu)
-    ggml.c
-    ggml-backend.c
-    ggml-alloc.c
-    ggml-quants.c
-    sgemm.cpp
-)
-
-# Function to compile a single source file
-compile_source() {
-    src="$1"
-    hipcc -c -O3 -DGGML_USE_CUDA -DGGML_BUILD=1 -DGGML_SHARED=1 -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 \
-          -DGGML_SCHED_MAX_COPIES=4 -DGGML_USE_HIPBLAS -DGGML_USE_LLAMAFILE -DHIP_FAST_MATH -DNDEBUG \
-          -DK_QUANTS_PER_ITERATION=2 -D_CRT_SECURE_NO_WARNINGS -DCMAKE_POSITION_INDEPENDENT_CODE=on \
-          -D_GNU_SOURCE -Wno-expansion-to-defined -Wno-invalid-noreturn -Wno-ignored-attributes -Wno-pass-failed \
-          -Wno-deprecated-declarations -Wno-unused-result -I. \
-          $additional_flags -o "${src%.cu}.o" "$src"
-}
-
-# Function to handle Ctrl+C
-cleanup() {
-    echo "Terminating all background processes..."
-    kill 0
-}
-
-# Set trap to handle SIGINT (Ctrl+C)
-trap cleanup SIGINT
-
-# Limit the number of concurrent jobs
-max_jobs=$(nproc)
-job_count=0
-
-for src in "${sources[@]}"; do
-    echo "$src"
-    compile_source "$src" &
-    job_count=$((job_count + 1))
-    if [[ $job_count -ge $max_jobs ]]; then
-        wait -n
-        job_count=$((job_count - 1))
-    fi
-done
-
-wait
-
-# Link all object files into a shared library
-echo "Linking object files..."
-hipcc -v -shared -o $output *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o -lhipblas -lamdhip64 -lrocblas
-
-# Clean up object files after linking
-rm -f *.o ggml-cuda/*.o ggml-cuda/template-instances/*.o

+ 2 - 2
llama/llama.go

@@ -23,8 +23,8 @@ package llama
 // #cgo rocm CFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm CFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 // #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_hipblas -lhipblas -lamdhip64 -lrocblas
 // #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_hipblas -lhipblas -lamdhip64 -lrocblas
-// #cgo windows,cuda LDFLAGS: -L. -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3/lib/x64" -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt
-// #cgo windows,rocm LDFLAGS: -L. -L"C:/Program Files/AMD/ROCm/5.7/lib"
+// #cgo windows,cuda LDFLAGS: -L${SRCDIR} -L"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.3/lib/x64" -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt
+// #cgo windows,rocm LDFLAGS: -L${SRCDIR} -L"C:/Program Files/AMD/ROCm/5.7/lib" -lggml_hipblas -lhipblas -lamdhip64 -lrocblas
 // #cgo linux,cuda LDFLAGS: -L${SRCDIR} -L/usr/local/cuda/lib64 -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt -lpthread -ldl -lrt
 // #cgo linux,cuda LDFLAGS: -L${SRCDIR} -L/usr/local/cuda/lib64 -lggml_cuda -lcuda -lcudart -lcublas -lcublasLt -lpthread -ldl -lrt
 // #cgo linux,rocm LDFLAGS: -L/opt/rocm/lib
 // #cgo linux,rocm LDFLAGS: -L/opt/rocm/lib
 // #include <stdlib.h>
 // #include <stdlib.h>