gen_linux.sh
#!/bin/bash
# This script is intended to run inside `go generate`;
# the working directory must be llm/generate/
#
# First we build one or more CPU-based LLM libraries.
#
# Then, if we detect CUDA, we build a CUDA dynamic library and carry the
# required library dependencies alongside it.
#
# Then, if we detect ROCm, we build a dynamically loaded ROCm library. The
# ROCm libraries are quite large, and they also dynamically load large data
# files at runtime, so we don't attempt to carry them as payload.
set -ex
set -o pipefail
compress_pids=""
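# Illustrative invocations (not exhaustive; every variable shown here is read
# further down in this script):
#   ./gen_linux.sh                               # build all detected variants
#   OLLAMA_SKIP_CUDA_GENERATE=1 ./gen_linux.sh   # skip the CUDA runner
#   OLLAMA_CPU_TARGET=cpu_avx2 ./gen_linux.sh    # build a single CPU variant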
# See https://llvm.org/docs/AMDGPUUsage.html#processors for reference
amdGPUs() {
    if [ -n "${AMDGPU_TARGETS}" ]; then
        echo "${AMDGPU_TARGETS}"
        return
    fi
    GPU_LIST=(
        "gfx900"
        "gfx906:xnack-"
        "gfx908:xnack-"
        "gfx90a:xnack+"
        "gfx90a:xnack-"
        "gfx940"
        "gfx941"
        "gfx942"
        "gfx1010"
        "gfx1012"
        "gfx1030"
        "gfx1100"
        "gfx1101"
        "gfx1102"
    )
    (
        IFS=$';'
        echo "'${GPU_LIST[*]}'"
    )
}
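# Example (illustrative): with AMDGPU_TARGETS unset, amdGPUs joins the list
# above with semicolons (the subshell IFS), emitting something like
#   'gfx900;gfx906:xnack-;gfx908:xnack-;...;gfx1102'
# To build for specific GPUs only, override the list up front:
#   AMDGPU_TARGETS="gfx1030;gfx1100" ./gen_linux.sh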
echo "Starting linux generate script"
if [ -z "${CUDACXX}" ]; then
    if [ -x /usr/local/cuda/bin/nvcc ]; then
        export CUDACXX=/usr/local/cuda/bin/nvcc
    else
        # Fall back to whatever nvcc is on the PATH, if any
        export CUDACXX=$(command -v nvcc)
    fi
fi
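# Illustrative override if nvcc lives in a versioned prefix (the path is an
# example, not something this script detects):
#   CUDACXX=/usr/local/cuda-12.4/bin/nvcc ./gen_linux.sh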
COMMON_CMAKE_DEFS="-DCMAKE_SKIP_RPATH=on -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
source $(dirname $0)/gen_common.sh
init_vars
git_module_setup
apply_patches
init_vars
if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
    # Users building from source can tune the exact flags we pass to cmake
    # for configuring llama.cpp; in that case we build only one CPU variant
    # as the default.
    if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
        init_vars
        echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
        RUNNER="cpu"
        BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
        echo "Building custom CPU"
        build
        install
        dist
        compress
    else
        # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, or AVX512
        # -DGGML_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
        # -DGGML_F16C -- 2012 Intel Ivy Bridge & 2011 AMD Bulldozer (no significant improvement over just AVX)
        # -DGGML_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
        # -DGGML_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
        # Note: the following seem to yield slower results than AVX2 - ymmv
        # -DGGML_AVX512 -- 2017 Intel Skylake and High End DeskTop (HEDT)
        # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
        # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
        if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
            #
            # CPU first for the default library, set up as the lowest common
            # denominator for maximum compatibility (including Rosetta)
            #
            init_vars
            CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
            RUNNER=cpu
            BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
            echo "Building LCD CPU"
            build
            install
            dist
            compress
        fi
        if [ "${ARCH}" == "x86_64" ]; then
            #
            # ARM chips in M1/M2/M3-based Macs and NVIDIA Tegra devices do
            # not currently support AVX extensions.
            #
            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx" ]; then
                #
                # ~2011 CPU dynamic library with more capabilities turned on
                # to optimize performance. Approximately 400% faster than the
                # LCD build on the same CPU.
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
                RUNNER=cpu_avx
                BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
                echo "Building AVX CPU"
                build
                install
                dist
                compress
            fi
            if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
                #
                # ~2013 CPU dynamic library. Approximately 10% faster than
                # AVX alone on the same CPU.
                #
                init_vars
                CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
                RUNNER=cpu_avx2
                BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
                echo "Building AVX2 CPU"
                build
                install
                dist
                compress
            fi
        fi
    fi
else
    echo "Skipping CPU generation step as requested"
fi
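# Illustrative CPU-tuning examples (variables consumed in the block above):
#   OLLAMA_CPU_TARGET=cpu_avx ./gen_linux.sh
#   OLLAMA_CUSTOM_CPU_DEFS="-DGGML_AVX=on -DGGML_AVX2=on -DGGML_FMA=on" ./gen_linux.sh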
# If needed, look for the default CUDA toolkit location
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /usr/local/cuda/lib64 ]; then
    CUDA_LIB_DIR=/usr/local/cuda/lib64
fi
# If needed, look for CUDA on Arch Linux
if [ -z "${CUDA_LIB_DIR}" ] && [ -d /opt/cuda/targets/x86_64-linux/lib ]; then
    CUDA_LIB_DIR=/opt/cuda/targets/x86_64-linux/lib
fi
# Allow override in case libcudart is in the wrong place
if [ -z "${CUDART_LIB_DIR}" ]; then
    CUDART_LIB_DIR="${CUDA_LIB_DIR}"
fi
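# Illustrative override when the toolkit lives in a non-default prefix (the
# path is an example):
#   CUDA_LIB_DIR=/usr/local/cuda-12.4/lib64 ./gen_linux.sh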
if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
    echo "CUDA libraries detected - building dynamic CUDA library"
    init_vars
    CUDA_MAJOR=$(ls "${CUDA_LIB_DIR}"/libcudart.so.* | head -1 | cut -f3 -d. || true)
    if [ -n "${CUDA_MAJOR}" -a -z "${CUDA_VARIANT}" ]; then
        CUDA_VARIANT=_v${CUDA_MAJOR}
    fi
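    # Example (illustrative): a CUDA 12 toolkit ships e.g. libcudart.so.12.4.127
    # (version numbers are an example); `cut -f3 -d.` extracts the major
    # version "12", so the runner ends up named cuda_v12.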
    if [ "${ARCH}" == "arm64" ]; then
        echo "ARM CPU detected - disabling unsupported AVX instructions"
        # ARM-based CPUs such as M1 and Tegra do not support AVX extensions.
        #
        # CUDA compute < 6.0 lacks proper FP16 support on ARM.
        # Disabling has minimal performance effect while maintaining compatibility.
        ARM64_DEFS="-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_CUDA_F16=off"
    fi
    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    if [ -n "${OLLAMA_CUSTOM_CUDA_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_CUDA_DEFS=\"${OLLAMA_CUSTOM_CUDA_DEFS}\""
        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${OLLAMA_CUSTOM_CUDA_DEFS}"
        echo "Building custom CUDA GPU"
    else
        CMAKE_CUDA_DEFS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
    fi
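    # Illustrative: extra llama.cpp CUDA flags can be injected here, e.g.
    # (the flag shown is an example of a llama.cpp cmake option, not one this
    # script sets itself):
    #   OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_FORCE_MMQ=on" ./gen_linux.sh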
    export CUDAFLAGS="-t8"
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
    RUNNER=cuda${CUDA_VARIANT}
    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
    export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
    CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
    build
    install
    dist
    echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
    mkdir -p "${CUDA_DIST_DIR}"
    for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
        cp -a "${lib}" "${CUDA_DIST_DIR}"
    done
    compress
fi
if [ -z "${ONEAPI_ROOT}" ]; then
    # Try the default location in case it exists
    ONEAPI_ROOT=/opt/intel/oneapi
fi
if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
    echo "OneAPI libraries detected - building dynamic OneAPI library"
    init_vars
    source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
    CC=icx
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
    RUNNER=oneapi
    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
    ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
    export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
    build
    # copy oneAPI dependencies
    mkdir -p "${ONEAPI_DIST_DIR}"
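    # Example (illustrative): the ldd pipeline below turns a line such as
    #   libmkl_core.so.2 => /opt/intel/oneapi/mkl/latest/lib/libmkl_core.so.2 (0x0000...)
    # into just the resolved path, keeping only sycl/mkl/tbb libraries.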
    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
        cp -a "${dep}" "${ONEAPI_DIST_DIR}"
    done
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
    install
    dist
    compress
fi
if [ -z "${ROCM_PATH}" ]; then
    # Try the default location in case it exists
    ROCM_PATH=/opt/rocm
fi
if [ -z "${CLBlast_DIR}" ]; then
    # Try the default location in case it exists
    if [ -d /usr/lib/cmake/CLBlast ]; then
        export CLBlast_DIR=/usr/lib/cmake/CLBlast
    fi
fi
if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
    echo "ROCm libraries detected - building dynamic ROCm library"
    if [ -f ${ROCM_PATH}/lib/librocblas.so.*.*.????? ]; then
        ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
    fi
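    # Example (illustrative): a ROCm 6.1.2 install might ship a file like
    # librocblas.so.4.1.60102 (name is an example); `cut -f5 -d.` picks out
    # the 5-digit build suffix, yielding a runner name like rocm_v60102.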
    init_vars
    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
    # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
    if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
        echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
        CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
        echo "Building custom ROCm GPU"
    fi
    RUNNER=rocm${ROCM_VARIANT}
    BUILD_DIR="../build/linux/${GOARCH}/${RUNNER}"
    # ROCm dependencies are too large to fit into a unified bundle
    ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
    # TODO figure out how to disable runpath (rpath)
    # export CMAKE_HIP_FLAGS="-fno-rtlib-add-rpath" # doesn't work
    export LLAMA_SERVER_LDFLAGS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
    build
    # copy the ROCm dependencies
    mkdir -p "${ROCM_DIST_DIR}"
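    # The loop below copies each ROCm-related dependency reported by ldd; when
    # a dependency is a symlink, it also copies the symlink's target so the
    # link chain resolves inside the dist directory.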
    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${GOARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
        cp -a "${dep}"* "${ROCM_DIST_DIR}"
        if [ $(readlink -f "${dep}") != "${dep}" ] ; then
            cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
        fi
    done
    install
    dist
    compress
fi
cleanup
wait_for_compress
echo "go generate completed. LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"