Browse Source

build: recursive make ggml-cuda

Michael Yang 4 months ago
parent
commit
054e31146e

+ 112 - 0
Makefile2

@@ -0,0 +1,112 @@
+export GOOS?=$(shell go env GOOS)
+export GOARCH?=$(shell go env GOARCH)
+
+build: llama/build/$(GOOS)-$(GOARCH)
+
+export GOFLAGS=-trimpath
+
+llama/build/%/runners/metal: GOFLAGS+=-tags=metal
+llama/build/%/runners/cpu_avx: GOFLAGS+=-tags=avx
+llama/build/%/runners/cpu_avx2: GOFLAGS+=-tags=avx2
+llama/build/%/runners/cuda_v11: GOFLAGS+=-tags=cuda,cuda_v11
+llama/build/%/runners/cuda_v12: GOFLAGS+=-tags=cuda,cuda_v12
+llama/build/%/runners/rocm: GOFLAGS+=-tags=cuda,rocm
+
+.PHONY: llama/build/darwin-amd64 llama/build/darwin-arm64
+llama/build/darwin-amd64: llama/build/darwin-amd64/runners/cpu_avx
+llama/build/darwin-arm64: llama/build/darwin-arm64/runners/metal
+
+.PHONY: llama/build/linux-amd64 llama/build/linux-arm64
+llama/build/linux-amd64: llama/build/linux-amd64/runners/cpu_avx
+llama/build/linux-amd64: llama/build/linux-amd64/runners/cpu_avx2
+llama/build/linux-arm64: llama/build/linux-arm64/runners/cpu_avx
+llama/build/linux-arm64: llama/build/linux-arm64/runners/cpu_avx2
+
+.PHONY: llama/build/windows-amd64 llama/build/windows-arm64
+llama/build/windows-amd64: llama/build/windows-amd64/runners/cpu_avx
+llama/build/windows-amd64: llama/build/windows-amd64/runners/cpu_avx2
+llama/build/windows-arm64: llama/build/windows-arm64/runners/cpu_avx
+llama/build/windows-arm64: llama/build/windows-arm64/runners/cpu_avx2
+
+.PHONY: cuda_v11 cuda_v12 rocm
+cuda_v11 cuda_v12 rocm:
+	$(MAKE) -C ml/backend/ggml/ggml/ggml-cuda $@
+
+ifeq ($(GOOS),linux)
+NVCC11=$(shell command -v /usr/local/cuda-11/bin/nvcc)
+NVCC12=$(shell command -v /usr/local/cuda-12/bin/nvcc)
+HIPCC=$(shell command -v hipcc)
+else ifeq ($(GOOS),windows)
+NVCC11=$(shell ls "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11."?"\\bin\\nvcc.exe")
+NVCC12=$(shell ls "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12."?"\\bin\\nvcc.exe")
+HIPCC=$(shell command -v hipcc)
+endif
+
+ifneq ($(NVCC11),)
+ifeq ($(OLLAMA_SKIP_GENERATE_CUDA_11),)
+CUDA_V11_TARGETS= \
+	llama/build/linux-amd64/runners/cuda_v11 \
+	llama/build/linux-arm64/runners/cuda_v11 \
+	llama/build/windows-amd64/runners/cuda_v11 \
+	llama/build/windows-arm64/runners/cuda_v11 \
+
+$(CUDA_V11_TARGETS): cuda_v11
+cuda_v11: export NVCC=$(NVCC11)
+
+llama/build/linux-amd64: llama/build/linux-amd64/runners/cuda_v11
+llama/build/linux-arm64: llama/build/linux-arm64/runners/cuda_v11
+llama/build/windows-amd64: llama/build/windows-amd64/runners/cuda_v11
+llama/build/windows-arm64: llama/build/windows-arm64/runners/cuda_v11
+endif
+endif
+
+ifneq ($(NVCC12),)
+ifeq ($(OLLAMA_SKIP_GENERATE_CUDA_12),)
+CUDA_V12_TARGETS= \
+	llama/build/linux-amd64/runners/cuda_v12 \
+	llama/build/linux-arm64/runners/cuda_v12 \
+	llama/build/windows-amd64/runners/cuda_v12 \
+	llama/build/windows-arm64/runners/cuda_v12 \
+
+$(CUDA_V12_TARGETS): cuda_v12
+cuda_v12: export NVCC=$(NVCC12)
+
+llama/build/linux-amd64: llama/build/linux-amd64/runners/cuda_v12
+llama/build/linux-arm64: llama/build/linux-arm64/runners/cuda_v12
+llama/build/windows-amd64: llama/build/windows-amd64/runners/cuda_v12
+llama/build/windows-arm64: llama/build/windows-arm64/runners/cuda_v12
+endif
+endif
+
+ifneq ($(HIPCC),)
+ifeq ($(OLLAMA_SKIP_GENERATE_ROCM),)
+ROCM_TARGETS= \
+	llama/build/linux-amd64/runners/rocm \
+	llama/build/linux-arm64/runners/rocm \
+	llama/build/windows-amd64/runners/rocm \
+	llama/build/windows-arm64/runners/rocm \
+
+$(ROCM_TARGETS): rocm
+rocm: export NVCC=$(HIPCC)
+
+llama/build/linux-amd64: llama/build/linux-amd64/runners/rocm
+llama/build/linux-arm64: llama/build/linux-arm64/runners/rocm
+llama/build/windows-amd64: llama/build/windows-amd64/runners/rocm
+llama/build/windows-arm64: llama/build/windows-arm64/runners/rocm
+endif
+endif
+
+export CGO_ENABLED=1
+export CGO_CPPFLAGS_ALLOW=-mfma|-mf16c
+
+llama/build/%: cmd/runner always
+	mkdir -p $@; go build -o $@ ./$<
+
+.PHONY: always
+always:
+
+.PHONY: clean realclean
+clean:
+	$(RM) -r llama/build
+
+realclean: clean
+	$(MAKE) -C ml/backend/ggml/ggml/ggml-cuda clean

+ 1 - 2
llama/README.md

@@ -37,8 +37,7 @@ go build -tags avx .
 ```shell
 # go doesn't recognize `-mfma` as a valid compiler flag
 # see https://github.com/golang/go/issues/17895
-go env -w "CGO_CFLAGS_ALLOW=-mfma|-mf16c"
-go env -w "CGO_CXXFLAGS_ALLOW=-mfma|-mf16c"
+go env -w "CGO_CPPFLAGS_ALLOW=-mfma|-mf16c"
 go build -tags=avx,avx2 .
 ```
 

+ 0 - 1
make/common-defs.make

@@ -94,4 +94,3 @@ CPPFLAGS += \
 	-I../ml/backend/ggml/ggml/include \
 	-I../ml/backend/ggml/ggml/ggml-cpu \
 	-I../ml/backend/ggml/ggml/ggml-cpu/amx \
->>>>>>> 22320f09 (preserve vendor directory structure):llama/make/common-defs.make

+ 9 - 52
make/gpu.make

@@ -11,74 +11,31 @@ GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERS
 # today, cuda is bundled, but rocm is split out.  Should split them each out by runner
 DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)
 
-
-GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
-
-GPU_RUNNER_SRCS := \
-	$(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \
-	$(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \
-	llama/ggml.c llama/ggml-backend.cpp llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-aarch64.c llama/ggml-threading.cpp
-GPU_RUNNER_HDRS := \
-	$(wildcard llama/ggml-cuda/*.cuh)
-
-
-# Conditional flags and components to speed up developer builds
-ifneq ($(OLLAMA_FAST_BUILD),)
-	GPU_COMPILER_CUFLAGS += 	\
-		-DGGML_DISABLE_FLASH_ATTN
-else
-	GPU_RUNNER_SRCS += \
-		$(wildcard llama/ggml-cuda/fattn*.cu) \
-		$(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \
-		$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
-		$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
-		$(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
-endif
-
-GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
-GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
-GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))
-
 DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
 BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
 
-
-$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) 
+$(GPU_RUNNER_NAME): $(BUILD_RUNNERS)
 
 dist: $(DIST_RUNNERS)
 
 # Build targets
-$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $<
-$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $<
-$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
 $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = $(CGO_EXTRA_LDFLAGS) -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/"
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
-	@-mkdir -p $(dir $@)
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+	@-mkdir -p $(@D)
+	$(MAKE) -C ml/backend/ggml/ggml/ggml-cuda $(GPU_RUNNER_NAME) CXX=$(GPU_COMPILER)
 	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
 
 # Distribution targets
 $(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
-	@-mkdir -p $(dir $@)
+	@-mkdir -p $(@D)
 	$(CP) $< $@
 $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_LIB_DEPS)
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
-	@-mkdir -p $(dir $@)
-	$(CP) $< $@
 $(GPU_DIST_LIB_DEPS):
-	@-mkdir -p $(dir $@)
-	$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
+	@-mkdir -p $(@D)
+	$(CP) $(GPU_LIB_DIR)/$(@F) $(@D)
 
-clean: 
-	rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS)
+clean:
+	$(RM) $(BUILD_RUNNERS) $(DIST_RUNNERS)
 
 .PHONY: clean $(GPU_RUNNER_NAME)
 

+ 1 - 2
ml/backend/ggml/ggml/ggml-cpu/cpu.go

@@ -4,8 +4,7 @@ package cpu
 // #cgo CPPFLAGS: -I${SRCDIR}/.. -I${SRCDIR}/../include -I${SRCDIR}/amx
 // #cgo CPPFLAGS: -D_GNU_SOURCE
 // #cgo amd64,avx CPPFLAGS: -mavx
-// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma
-// #cgo amd64,f16c CPPFLAGS: -mf16c
+// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -mf16c
 // #cgo arm64 CPPFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 import "C"
 import (

+ 1 - 0
ml/backend/ggml/ggml/ggml-cuda/.gitignore

@@ -0,0 +1 @@
+*.o

+ 64 - 0
ml/backend/ggml/ggml/ggml-cuda/Makefile

@@ -0,0 +1,64 @@
+NVCC?=nvcc
+
+NVCC_PREPEND_FLAGS= \
+	-t 2 \
+	-split-compile 0 \
+	-std=c++17 \
+	-I.. \
+	-I../include \
+	$(foreach ARCH,$(subst ;, ,$(CUDA_ARCHS)),--generate-code=arch=compute_$(ARCH),code=sm_$(ARCH)) \
+
+NVCC_APPEND_FLAGS=
+# NVCC_APPEND_FLAGS+=-DGGML_CUDA_USE_GRAPHS=1
+
+ALL_SOURCES=$(wildcard *.cu)
+FATTN_SOURCES=$(wildcard fattn*.cu)
+
+SOURCES= \
+	$(filter-out $(FATTN_SOURCES),$(ALL_SOURCES)) \
+	$(wildcard template-instances/mmq*.cu) \
+
+ifneq ($(OLLAMA_FAST_BUILD),)
+NVCC_APPEND_FLAGS+=-DGGML_DISABLE_FLASH_ATTN
+else
+SOURCES+= \
+	$(FATTN_SOURCES) \
+	$(wildcard template-instances/fattn-wmma*.cu) \
+	$(wildcard template-instances/fattn-vec*q4_0-q4_0.cu) \
+	$(wildcard template-instances/fattn-vec*q8_0-q8_0.cu) \
+	$(wildcard template-instances/fattn-vec*f16-f16.cu)
+endif
+
+all: cuda_v11 cuda_v12
+
+cuda_v11: CUDA_ARCHS?=50;52;53;60;61;62;70;72;75;80;86
+cuda_v11: OBJECTS=$(patsubst %.cu,%.v11.o,$(SOURCES))
+
+cuda_v12: CUDA_ARCHS?=60;61;62;70;72;75;80;86;87;89;90;90a
+cuda_v12: OBJECTS=$(patsubst %.cu,%.v12.o,$(SOURCES))
+
+rocm: CPPFLAGS+=-DGGML_USE_HIP
+rocm: OBJECTS=$(patsubst %.cu,%.rocm.o,$(SOURCES))
+
+ifeq ($(OS),Windows_NT)
+cuda_v11: libggml_cuda_v11.dll
+cuda_v12: libggml_cuda_v12.dll
+rocm: libggml_rocm.dll
+else
+cuda_v11: libggml_cuda_v11.a
+cuda_v12: libggml_cuda_v12.a
+rocm: libggml_rocm.a
+endif
+
+clean:
+	$(RM) *.a *.dll *.o template-instances/*.o
+
+# Keep these as three separate pattern rules: a single rule with multiple
+# target patterns is a grouped pattern rule, so one recipe run would mark all
+# three object suffixes for the same stem as up to date within one invocation
+# (e.g. `make all`), leaving the other variants' objects unbuilt.
+%.v11.o: %.cu
+	$(NVCC) $(NVCC_PREPEND_FLAGS) -c $< -o $@ $(NVCC_APPEND_FLAGS)
+
+%.v12.o: %.cu
+	$(NVCC) $(NVCC_PREPEND_FLAGS) -c $< -o $@ $(NVCC_APPEND_FLAGS)
+
+%.rocm.o: %.cu
+	$(NVCC) $(NVCC_PREPEND_FLAGS) -c $< -o $@ $(NVCC_APPEND_FLAGS)
+
+.SECONDEXPANSION:
+%.a: $$(OBJECTS)
+	$(AR) rcs $@ $^
+
+%.dll: $$(OBJECTS)
+	$(NVCC) -shared -o $@ $^

+ 4 - 0
ml/backend/ggml/ggml/ggml-cuda/cuda.go

@@ -1,3 +1,7 @@
 package cuda
 
+// #cgo cuda_v11 LDFLAGS: -L. -lggml_cuda_v11
+// #cgo cuda_v12 LDFLAGS: -L. -lggml_cuda_v12
+// #cgo cuda_v11 cuda_v12 LDFLAGS: -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcublasLt
+// #cgo rocm LDFLAGS: -L. -lggml_rocm -L/opt/rocm/lib -lhipblas -lamdhip64 -lrocblas
 import "C"

+ 1 - 2
ml/backend/ggml/ggml/ggml.go

@@ -5,8 +5,7 @@ package ggml
 // #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_CPU
 // #cgo darwin LDFLAGS: -framework Foundation
 // #cgo amd64,avx CPPFLAGS: -mavx
-// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma
-// #cgo amd64,f16c CPPFLAGS: -mf16c
+// #cgo amd64,avx2 CPPFLAGS: -mavx2 -mfma -mf16c
 // #cgo arm64 CPPFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 import "C"
 import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-cpu"

+ 1 - 0
ml/backend/ggml/ggml/ggml_cuda.go

@@ -3,5 +3,6 @@
 package ggml
 
 // #cgo CPPFLAGS: -DGGML_USE_CUDA
+// #cgo rocm CPPFLAGS: -DGGML_USE_HIP
 import "C"
 import _ "github.com/ollama/ollama/ml/backend/ggml/ggml/ggml-cuda"

+ 1 - 3
scripts/build_darwin.sh

@@ -10,9 +10,7 @@ mkdir -p dist
 # If installed to an alternate location use the following to enable
 # export SDKROOT=/Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
 # export DEVELOPER_DIR=/Applications/Xcode_12.5.1.app/Contents/Developer
-export CGO_CFLAGS=-mmacosx-version-min=11.3
-export CGO_CXXFLAGS=-mmacosx-version-min=11.3
-export CGO_LDFLAGS=-mmacosx-version-min=11.3
+export CGO_CPPFLAGS=-mmacosx-version-min=11.3
 
 rm -rf llama/build dist/darwin-*
 echo "Building darwin arm64"