Procházet zdrojové kódy

Add cuda v12 variant and selection logic

Based on compute capability and driver version, pick
v12 or v11 cuda variants.
Daniel Hiltgen před 10 měsíci
rodič
revize
4fe3a556fa
4 změnil soubory, kde provedl 84 přidání a 48 odebrání
  1. 33 10
      Dockerfile
  2. 43 0
      gpu/cuda_common.go
  3. 4 36
      gpu/gpu.go
  4. 4 2
      gpu/types.go

+ 33 - 10
Dockerfile

@@ -1,7 +1,7 @@
 ARG GOLANG_VERSION=1.22.5
 ARG CMAKE_VERSION=3.22.1
-# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
-ARG CUDA_VERSION=11.3.1
+ARG CUDA_VERSION_11=11.3.1
+ARG CUDA_VERSION_12=12.4.0
 ARG ROCM_VERSION=6.1.2
 ARG JETPACK_6=r36.2.0
 ARG JETPACK_5=r35.4.1
@@ -13,7 +13,7 @@ COPY .git .git
 COPY .gitmodules .gitmodules
 COPY llm llm
 
-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -23,9 +23,29 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH amd64 
 RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \
+    CUDA_VARIANT="_v11" \
+    bash gen_linux.sh
 
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
+ARG CMAKE_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+ARG CGO_CFLAGS
+ENV GOARCH amd64 
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \
+    CUDA_VARIANT="_v12" \
+    bash gen_linux.sh
+
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -34,7 +54,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH arm64 
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 
 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64
 ARG CMAKE_VERSION
@@ -139,8 +160,10 @@ COPY . .
 COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
@@ -155,8 +178,8 @@ ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ## arm binary += 381M 
 COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/

+ 43 - 0
gpu/cuda_common.go

@@ -4,9 +4,17 @@ package gpu
 
 import (
 	"log/slog"
+	"os"
+	"regexp"
+	"runtime"
+	"strconv"
 	"strings"
 )
 
+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
 func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	ids := []string{}
 	for _, info := range gpuInfo {
@@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	}
 	return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
 }
+
+func cudaGetVariant(gpuInfo CudaGPUInfo) string {
+	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
+		if CudaTegra != "" {
+			ver := strings.Split(CudaTegra, ".")
+			if len(ver) > 0 {
+				return "jetpack" + ver[0]
+			}
+		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
+			r := regexp.MustCompile(` R(\d+) `)
+			m := r.FindSubmatch(data)
+			if len(m) != 2 {
+				slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
+			} else {
+				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
+					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
+					// https://developer.nvidia.com/embedded/jetpack-archive
+					switch l4t {
+					case 35:
+						return "jetpack5"
+					case 36:
+						return "jetpack6"
+					default:
+						slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
+					}
+				}
+			}
+		}
+	}
+
+	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
+		return "v11"
+	}
+	return "v12"
+}

+ 4 - 36
gpu/gpu.go

@@ -15,9 +15,7 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
-	"regexp"
 	"runtime"
-	"strconv"
 	"strings"
 	"sync"
 	"unsafe"
@@ -66,10 +64,6 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
 
-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
-
 // Note: gpuMutex must already be held
 func initCudaHandles() *cudaHandles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
@@ -233,35 +227,6 @@ func GetGPUInfo() GpuInfoList {
 
 		depPath := GetDepDir()
 
-		var cudaVariant string
-		if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
-			if CudaTegra != "" {
-				ver := strings.Split(CudaTegra, ".")
-				if len(ver) > 0 {
-					cudaVariant = "jetpack" + ver[0]
-				}
-			} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
-				r := regexp.MustCompile(` R(\d+) `)
-				m := r.FindSubmatch(data)
-				if len(m) != 2 {
-					slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
-				} else {
-					if l4t, err := strconv.Atoi(string(m[1])); err == nil {
-						// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
-						// https://developer.nvidia.com/embedded/jetpack-archive
-						switch l4t {
-						case 35:
-							cudaVariant = "jetpack5"
-						case 36:
-							cudaVariant = "jetpack6"
-						default:
-							slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
-						}
-					}
-				}
-			}
-		}
-
 		// Load ALL libraries
 		cHandles = initCudaHandles()
 
@@ -271,7 +236,6 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo := CudaGPUInfo{
 					GpuInfo: GpuInfo{
 						Library: "cuda",
-						Variant: cudaVariant,
 					},
 					index: i,
 				}
@@ -297,7 +261,10 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+				gpuInfo.computeMajor = int(memInfo.major)
+				gpuInfo.computeMinor = int(memInfo.minor)
 				gpuInfo.MinimumMemory = cudaMinimumMemory
+				cudaVariant := cudaGetVariant(gpuInfo)
 				if depPath != "" {
 					gpuInfo.DependencyPath = depPath
 					// Check for variant specific directory
@@ -310,6 +277,7 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
+				gpuInfo.Variant = cudaGetVariant(gpuInfo)
 
 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates

+ 4 - 2
gpu/types.go

@@ -53,8 +53,10 @@ type CPUInfo struct {
 
 type CudaGPUInfo struct {
 	GpuInfo
-	OSOverhead uint64 // Memory overhead between the driver library and management library
-	index      int    //nolint:unused,nolintlint
+	OSOverhead   uint64 // Memory overhead between the driver library and management library
+	index        int    //nolint:unused,nolintlint
+	computeMajor int    //nolint:unused,nolintlint
+	computeMinor int    //nolint:unused,nolintlint
 }
 type CudaGPUInfoList []CudaGPUInfo