
Add cuda v12 variant and selection logic

Based on compute capability and driver version, pick the
v12 or v11 CUDA variant.
Daniel Hiltgen · 10 months ago · commit 4fe3a556fa
4 changed files with 84 additions and 48 deletions:
  1. Dockerfile (+33, -10)
  2. gpu/cuda_common.go (+43, -0)
  3. gpu/gpu.go (+4, -36)
  4. gpu/types.go (+4, -2)
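
Setting the Jetson/L4T special cases aside, the selection rule described in the commit message reduces to a single predicate over compute capability and driver major version (see cudaGetVariant in gpu/cuda_common.go below). A minimal standalone sketch, with an illustrative helper name that is not part of this commit:

```go
// Sketch of the non-Jetson fallback rule: GPUs below compute capability 6.x,
// or hosts whose driver only reports CUDA 11.x, get the v11 runner build;
// everything newer gets v12. The helper name is invented for this example.
package main

import "fmt"

func pickCudaVariant(computeMajor, driverMajor int) string {
	if computeMajor < 6 || driverMajor < 12 {
		return "v11"
	}
	return "v12"
}

func main() {
	fmt.Println(pickCudaVariant(5, 12)) // "v11": Maxwell-era GPU, too old for the v12 build
	fmt.Println(pickCudaVariant(7, 11)) // "v11": driver predates CUDA 12
	fmt.Println(pickCudaVariant(8, 12)) // "v12"
}
```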

Dockerfile (+33, -10)

@@ -1,7 +1,7 @@
 ARG GOLANG_VERSION=1.22.5
 ARG CMAKE_VERSION=3.22.1
-# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
-ARG CUDA_VERSION=11.3.1
+ARG CUDA_VERSION_11=11.3.1
+ARG CUDA_VERSION_12=12.4.0
 ARG ROCM_VERSION=6.1.2
 ARG JETPACK_6=r36.2.0
 ARG JETPACK_5=r35.4.1
@@ -13,7 +13,7 @@ COPY .git .git
 COPY .gitmodules .gitmodules
 COPY llm llm
 
-FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION-devel-centos7 AS cuda-build-amd64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_11-devel-centos7 AS cuda-11-build-amd64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -23,9 +23,29 @@ WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH amd64 
 RUN --mount=type=cache,target=/root/.ccache \
-    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86" \
+    CUDA_VARIANT="_v11" \
+    bash gen_linux.sh
 
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION-devel-rockylinux8 AS cuda-build-server-arm64
+FROM --platform=linux/amd64 nvidia/cuda:$CUDA_VERSION_12-devel-centos7 AS cuda-12-build-amd64
+ARG CMAKE_VERSION
+COPY ./scripts/rh_linux_deps.sh /
+RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
+ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
+COPY --from=llm-code / /go/src/github.com/ollama/ollama/
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+ARG CGO_CFLAGS
+ENV GOARCH amd64 
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 \
+    OLLAMA_SKIP_CPU_GENERATE=1 \
+    CMAKE_CUDA_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a" \
+    CUDA_VARIANT="_v12" \
+    bash gen_linux.sh
+
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -34,7 +54,8 @@ COPY --from=llm-code / /go/src/github.com/ollama/ollama/
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 ARG CGO_CFLAGS
 ENV GOARCH arm64 
-RUN OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
+RUN --mount=type=cache,target=/root/.ccache \
+    OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_SKIP_CPU_GENERATE=1 bash gen_linux.sh
 
 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_6} AS cuda-build-jetpack6-arm64
 ARG CMAKE_VERSION
@@ -139,8 +160,10 @@ COPY . .
 COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
@@ -155,8 +178,8 @@ ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ## arm binary += 381M 
 COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=cuda-build-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/

gpu/cuda_common.go (+43, -0)

@@ -4,9 +4,17 @@ package gpu
 
 import (
 	"log/slog"
+	"os"
+	"regexp"
+	"runtime"
+	"strconv"
 	"strings"
 )
 
+// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
+// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
+var CudaTegra string = os.Getenv("JETSON_JETPACK")
+
 func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	ids := []string{}
 	for _, info := range gpuInfo {
@@ -19,3 +27,38 @@ func cudaGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
 	}
 	return "CUDA_VISIBLE_DEVICES", strings.Join(ids, ",")
 }
+
+func cudaGetVariant(gpuInfo CudaGPUInfo) string {
+	if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
+		if CudaTegra != "" {
+			ver := strings.Split(CudaTegra, ".")
+			if len(ver) > 0 {
+				return "jetpack" + ver[0]
+			}
+		} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
+			r := regexp.MustCompile(` R(\d+) `)
+			m := r.FindSubmatch(data)
+			if len(m) != 2 {
+				slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
+			} else {
+				if l4t, err := strconv.Atoi(string(m[1])); err == nil {
+					// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
+					// https://developer.nvidia.com/embedded/jetpack-archive
+					switch l4t {
+					case 35:
+						return "jetpack5"
+					case 36:
+						return "jetpack6"
+					default:
+						slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
+					}
+				}
+			}
+		}
+	}
+
+	if gpuInfo.computeMajor < 6 || gpuInfo.DriverMajor < 12 {
+		return "v11"
+	}
+	return "v12"
+}
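
As a usage illustration (not included in this commit), a table-driven test inside the gpu package could exercise the fallback rule; it assumes a non-Jetson host so the arm64/L4T branch above is skipped, and relies on DriverMajor being the field promoted from the embedded GpuInfo:

```go
// Illustrative sketch of a gpu/cuda_common_test.go; not part of this commit.
package gpu

import "testing"

func TestCudaGetVariant(t *testing.T) {
	// Assumes a non-Jetson host, so the arm64/L4T branch is not taken.
	cases := []struct {
		computeMajor int
		driverMajor  int
		want         string
	}{
		{5, 12, "v11"}, // compute capability too old for the v12 build
		{7, 11, "v11"}, // driver only reports CUDA 11.x
		{8, 12, "v12"},
	}
	for _, c := range cases {
		info := CudaGPUInfo{computeMajor: c.computeMajor}
		info.DriverMajor = c.driverMajor // promoted from the embedded GpuInfo
		if got := cudaGetVariant(info); got != c.want {
			t.Errorf("compute %d / driver %d: got %q, want %q",
				c.computeMajor, c.driverMajor, got, c.want)
		}
	}
}
```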

gpu/gpu.go (+4, -36)

@@ -15,9 +15,7 @@ import (
 	"log/slog"
 	"os"
 	"path/filepath"
-	"regexp"
 	"runtime"
-	"strconv"
 	"strings"
 	"sync"
 	"unsafe"
@@ -66,10 +64,6 @@ var RocmComputeMin = 9
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
 
-// Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
-// Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
-var CudaTegra string = os.Getenv("JETSON_JETPACK")
-
 // Note: gpuMutex must already be held
 func initCudaHandles() *cudaHandles {
 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing
@@ -233,35 +227,6 @@ func GetGPUInfo() GpuInfoList {
 
 		depPath := GetDepDir()
 
-		var cudaVariant string
-		if runtime.GOARCH == "arm64" && runtime.GOOS == "linux" {
-			if CudaTegra != "" {
-				ver := strings.Split(CudaTegra, ".")
-				if len(ver) > 0 {
-					cudaVariant = "jetpack" + ver[0]
-				}
-			} else if data, err := os.ReadFile("/etc/nv_tegra_release"); err == nil {
-				r := regexp.MustCompile(` R(\d+) `)
-				m := r.FindSubmatch(data)
-				if len(m) != 2 {
-					slog.Info("Unexpected format for /etc/nv_tegra_release.  Set JETSON_JETPACK to select version")
-				} else {
-					if l4t, err := strconv.Atoi(string(m[1])); err == nil {
-						// Note: mapping from L4t -> JP is inconsistent (can't just subtract 30)
-						// https://developer.nvidia.com/embedded/jetpack-archive
-						switch l4t {
-						case 35:
-							cudaVariant = "jetpack5"
-						case 36:
-							cudaVariant = "jetpack6"
-						default:
-							slog.Info("unsupported L4T version", "nv_tegra_release", string(data))
-						}
-					}
-				}
-			}
-		}
-
 		// Load ALL libraries
 		cHandles = initCudaHandles()
 
@@ -271,7 +236,6 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo := CudaGPUInfo{
 					GpuInfo: GpuInfo{
 						Library: "cuda",
-						Variant: cudaVariant,
 					},
 					index: i,
 				}
@@ -297,7 +261,10 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.FreeMemory = uint64(memInfo.free)
 				gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 				gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+				gpuInfo.computeMajor = int(memInfo.major)
+				gpuInfo.computeMinor = int(memInfo.minor)
 				gpuInfo.MinimumMemory = cudaMinimumMemory
+				cudaVariant := cudaGetVariant(gpuInfo)
 				if depPath != "" {
 					gpuInfo.DependencyPath = depPath
 					// Check for variant specific directory
@@ -310,6 +277,7 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
+				gpuInfo.Variant = cudaGetVariant(gpuInfo)
 
 				// query the management library as well so we can record any skew between the two
 				// which represents overhead on the GPU we must set aside on subsequent updates
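
The variant-specific directory check referenced by the "Check for variant specific directory" comment falls outside the hunk shown here; as a rough, hypothetical sketch of that kind of lookup (names and structure assumed, not taken from the diff), the variant string would select a cuda_v11 or cuda_v12 subdirectory of the dependency path when one exists:

```go
// Hypothetical sketch only; the actual check in gpu.go is elided by the
// diff context above. Prefer a variant-specific library directory such as
// <depPath>/cuda_v12 when it is present on disk.
variantPath := filepath.Join(depPath, "cuda_"+cudaVariant)
if _, err := os.Stat(variantPath); err == nil {
	gpuInfo.DependencyPath = variantPath
}
```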

gpu/types.go (+4, -2)

@@ -53,8 +53,10 @@ type CPUInfo struct {
 
 type CudaGPUInfo struct {
 	GpuInfo
-	OSOverhead uint64 // Memory overhead between the driver library and management library
-	index      int    //nolint:unused,nolintlint
+	OSOverhead   uint64 // Memory overhead between the driver library and management library
+	index        int    //nolint:unused,nolintlint
+	computeMajor int    //nolint:unused,nolintlint
+	computeMinor int    //nolint:unused,nolintlint
 }
 type CudaGPUInfoList []CudaGPUInfo