Makefile.rocm 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. # Build rules for ROCm runner
  2. #
  3. # Note: at present we only support a single ROCm version (whichever is default on the build system)
  4. # unlike CUDA where we'll build both a v11 and v12 variant.
  5. include make/common-defs.make
  6. include make/rocm-defs.make
  # GPU architectures targeted on every supported OS.
  HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
  # Additional targets (with explicit xnack settings) that are only added on linux.
  HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-

  # Per-OS setup: where the ROCm libraries live, the linker search path handed
  # to cgo, the default architecture list and the host C/C++ flags.
  # HIP_ARCHS is assigned with ?= so a value provided by the caller is kept.
  ifeq ($(OS),linux)
    # Use $(HIP_PATH)/lib64 when it exists, otherwise fall back to $(HIP_PATH)/lib.
    GPU_LIB_DIR := $(strip $(shell ls -d $(HIP_PATH)/lib64 2>/dev/null || ls -d $(HIP_PATH)/lib 2>/dev/null))
    CGO_EXTRA_LDFLAGS := -L$(GPU_LIB_DIR)
    HIP_ARCHS ?= $(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX)
    GPU_COMPILER_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE
    GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
  else ifeq ($(OS),windows)
    # Paths are passed through cygpath -m -s before use.
    GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin")
    CGO_EXTRA_LDFLAGS := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
    HIP_ARCHS ?= $(HIP_ARCHS_COMMON)
    GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
    GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
  endif

  # The GPU sources are compiled with the HIP compiler.
  GPU_COMPILER = $(HIP_COMPILER)
  23. # TODO future multi-variant support for ROCm
  24. # ROCM_VERSION = $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(GPU_COMPILER) --version)))))))
  25. # ifneq (,$(ROCM_VERSION))
  26. # GPU_RUNNER_VARIANT = _v$(ROCM_VERSION)
  27. # endif
  # Identity of the ROCm runner (presumably consumed by make/gpu.make, which is
  # included further down -- confirm there).
  # GPU_RUNNER_VARIANT is expanded immediately; its only assignment in this file
  # sits inside the commented-out multi-variant TODO, so unless it is set
  # elsewhere the runner name is plain "rocm".
  GPU_RUNNER_NAME := rocm$(GPU_RUNNER_VARIANT)
  GPU_RUNNER_GO_TAGS := rocm
  # Short names of the ROCm libraries shipped with the runner, and the driver
  # library it links against.
  GPU_RUNNER_LIBS_SHORT := hipblas rocblas
  GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64
  # Note: ROCm requires an extra step of discovering and copying the transitive dependencies on linux
  #
  # The probe-backed lists (GPU_LIBS, ROCM_TRANSITIVE_LIBS_INITIAL,
  # GPU_TRANSITIVE_LIBS) are assigned with := so that wildcard, ldd and readlink
  # run once while the makefile is parsed instead of on every expansion, which
  # with a recursive variable includes once per library copy recipe.
  # This assumes SHARED_PREFIX and SHARED_EXT are already defined by the
  # includes at the top of this file -- TODO confirm.
  ifeq ($(OS),windows)
    ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)/lib/ollama
    GPU_LIBS := $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
  else ifeq ($(OS),linux)
    ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)-rocm/lib/ollama
    # Versioned shared objects of the runner libraries found in GPU_LIB_DIR.
    GPU_LIBS := $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
    # Resolved paths reported by ldd for those libraries, restricted to the
    # rocm/amdgpu/libtinfo/libnuma/libelf entries.  Skipped when no library was
    # found so that ldd is never invoked without operands.
    ROCM_TRANSITIVE_LIBS_INITIAL := $(if $(GPU_LIBS),$(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf)))
    # The ldd paths plus their readlink -f targets; readlink is likewise skipped
    # when there is nothing to resolve.
    GPU_TRANSITIVE_LIBS := $(if $(ROCM_TRANSITIVE_LIBS_INITIAL),$(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL)))
    # Drop entries whose file name is already covered by GPU_LIBS.
    FILTERED_GPU_TRANSITIVE_LIBS = $(sort $(filter-out $(addprefix %,$(notdir $(GPU_LIBS))),$(GPU_TRANSITIVE_LIBS)))
    # Destination paths of the remaining transitive libraries inside the dist tree.
    GPU_DIST_TRANSITIVE_LIB_DEPS = $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(FILTERED_GPU_TRANSITIVE_LIBS))))
  endif
  # Destination paths of the runner libraries themselves, and of the rocblas
  # manifest file used below as the target that stands for the rocblas copy.
  GPU_DIST_LIB_DEPS = $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
  ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt
  # OS-specific options that lead the GPU compile flags.
  ifeq ($(OS),windows)
    GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt
  else ifeq ($(OS),linux)
    GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++17
  endif

  # One --offload-arch option per HIP_ARCHS entry; entries may be separated by
  # whitespace or by semicolons.
  GPU_RUNNER_ARCH_FLAGS := $(foreach hip_arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(hip_arch))

  # HIPCC uses clang which requires avx512 -> -mavx512f -mavx512dq -mavx512bw
  # Kept recursive so GPU_RUNNER_CPU_FLAGS is read at the point of use.
  GPU_VECTOR_FLAGS = $(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
  # Flags for compiling the GPU (HIP) sources.  The list is assembled in groups
  # with +=; the resulting value holds the same flags in the same order as a
  # single continued assignment would.  The variable stays recursive, so
  # GPU_COMPILER_FPIC and GPU_VECTOR_FLAGS are expanded at the point of use.

  # OS-specific options and CPU vector extensions.
  GPU_COMPILER_CUFLAGS = $(GPU_COMPILER_FPIC) $(addprefix -m,$(GPU_VECTOR_FLAGS)) -mf16c -mfma
  # Compile only, optimized.
  GPU_COMPILER_CUFLAGS += -c -O3
  # GGML build configuration.
  GPU_COMPILER_CUFLAGS += -DGGML_USE_CUDA -DGGML_BUILD=1 -DGGML_BACKEND_BUILD=1
  GPU_COMPILER_CUFLAGS += -DGGML_SHARED=1 -DGGML_BACKEND_SHARED=1
  GPU_COMPILER_CUFLAGS += -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_SCHED_MAX_COPIES=4
  GPU_COMPILER_CUFLAGS += -DGGML_USE_HIP -DGGML_USE_LLAMAFILE
  # HIP platform selection.
  GPU_COMPILER_CUFLAGS += -DHIP_FAST_MATH -D__HIP_PLATFORM_AMD__=1 -D__HIP_ROCclr__=1
  # Remaining preprocessor definitions.
  GPU_COMPILER_CUFLAGS += -DNDEBUG -DK_QUANTS_PER_ITERATION=2 -D_CRT_SECURE_NO_WARNINGS
  GPU_COMPILER_CUFLAGS += -D_GNU_SOURCE -D_XOPEN_SOURCE=600 -DUSE_PROF_API=1
  # Language standard and input language.
  GPU_COMPILER_CUFLAGS += -std=gnu++17 -x hip
  # Options forwarded to LLVM.
  GPU_COMPILER_CUFLAGS += -mllvm=-amdgpu-early-inline-all=true -mllvm=-amdgpu-function-calls=false
  # Silenced warnings.
  GPU_COMPILER_CUFLAGS += -Wno-expansion-to-defined -Wno-invalid-noreturn -Wno-ignored-attributes
  GPU_COMPILER_CUFLAGS += -Wno-pass-failed -Wno-deprecated-declarations -Wno-unused-result
  # Header search path.
  GPU_COMPILER_CUFLAGS += -I./llama/

  # Workaround buggy P2P copy on some windows multi-GPU setups
  # This workaround breaks linux systems with small system RAM, so only enable on windows
  ifeq ($(OS),windows)
    GPU_COMPILER_CUFLAGS += -DGGML_CUDA_NO_PEER_COPY=1
  endif
  96. include make/gpu.make
  # Adjust the rules from gpu.make to handle the ROCm dependencies properly:
  # the runner binary additionally depends on the staged rocblas data files
  # (represented by their manifest) and on every staged transitive library.
  $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(GPU_DIST_TRANSITIVE_LIB_DEPS)

  # Copy the complete rocblas/library directory from GPU_LIB_DIR into the dist
  # tree.  mkdir -p already succeeds when the directory exists, so its errors
  # are no longer ignored.  The source directory is checked up front because
  # the exit status of the tar pipeline is that of the extracting side only: a
  # failing cd on the producing side would otherwise not stop the build.
  $(ROCBLAS_DIST_DEP_MANIFEST):
	@mkdir -p $(@D)
	@echo "Copying rocblas library..."
	@test -d $(GPU_LIB_DIR)/rocblas/library/ || { echo "rocblas library directory not found: $(GPU_LIB_DIR)/rocblas/library/" >&2; exit 1; }
	(cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . ) | (cd $(@D) && tar xf - )
	@echo "rocblas library copy complete"

  # Copy each transitive library into the dist tree.  The source is the first
  # GPU_TRANSITIVE_LIBS entry whose file name equals the target's file name;
  # matching on "/name" avoids picking up libraries whose name merely ends in
  # the target name, and firstword keeps the copy command well-formed when the
  # same file name is reachable through more than one directory.
  $(GPU_DIST_TRANSITIVE_LIB_DEPS):
	@mkdir -p $(@D)
	$(CP) $(firstword $(filter %/$(@F),$(GPU_TRANSITIVE_LIBS))) $(@D)/