Browse Source

Optimize container images for startup (#6547)

* Optimize container images for startup

This change adjusts how to handle runner payloads to support
container builds where we keep them extracted in the filesystem.
This makes it easier to optimize the cpu/cuda vs cpu/rocm images for
size, and should result in faster startup times for container images.

* Refactor payload logic and add buildx support for faster builds

* Move payloads around

* Review comments

* Converge to buildx based helper scripts

* Use docker buildx action for release
Daniel Hiltgen 7 months ago
parent
commit
cd5c8f6471

+ 2 - 0
.dockerignore

@@ -7,3 +7,5 @@ llm/llama.cpp
 .env
 .env
 .cache
 .cache
 test_data
 test_data
+llm/build
+llama/build

+ 180 - 29
.github/workflows/release.yaml

@@ -102,8 +102,8 @@ jobs:
         with:
         with:
           name: generate-windows-cpu
           name: generate-windows-cpu
           path: |
           path: |
-            llm/build/**/bin/*
-            llm/build/**/*.a
+            build/**/*
+            build/**/*.a
             dist/windows-amd64/**
             dist/windows-amd64/**
 
 
   # ROCm generation step
   # ROCm generation step
@@ -176,7 +176,7 @@ jobs:
         with:
         with:
           name: generate-windows-rocm
           name: generate-windows-rocm
           path: |
           path: |
-            llm/build/**/bin/*
+            build/**/*
             dist/windows-amd64/**
             dist/windows-amd64/**
       - uses: actions/upload-artifact@v4
       - uses: actions/upload-artifact@v4
         with:
         with:
@@ -265,7 +265,7 @@ jobs:
         with:
         with:
           name: generate-windows-cuda-${{ matrix.cuda.version }}
           name: generate-windows-cuda-${{ matrix.cuda.version }}
           path: |
           path: |
-            llm/build/**/bin/*
+            build/**/*
             dist/windows-amd64/**
             dist/windows-amd64/**
       - uses: actions/upload-artifact@v4
       - uses: actions/upload-artifact@v4
         with:
         with:
@@ -338,7 +338,7 @@ jobs:
       - uses: actions/download-artifact@v4
       - uses: actions/download-artifact@v4
         with:
         with:
           name: generate-windows-rocm
           name: generate-windows-rocm
-      - run: dir llm/build
+      - run: dir build
       - run: |
       - run: |
           $gopath=(get-command go).source | split-path -parent
           $gopath=(get-command go).source | split-path -parent
           & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
           & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
@@ -359,9 +359,7 @@ jobs:
     environment: release
     environment: release
     runs-on: linux
     runs-on: linux
     env:
     env:
-      OLLAMA_SKIP_MANIFEST_CREATE: '1'
       BUILD_ARCH: amd64
       BUILD_ARCH: amd64
-      PUSH: '1'
     steps:
     steps:
       - uses: actions/checkout@v4
       - uses: actions/checkout@v4
         with:
         with:
@@ -369,14 +367,8 @@ jobs:
       - name: Set Version
       - name: Set Version
         shell: bash
         shell: bash
         run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
         run: echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
       - run: |
       - run: |
           ./scripts/build_linux.sh
           ./scripts/build_linux.sh
-          ./scripts/build_docker.sh
       - uses: actions/upload-artifact@v4
       - uses: actions/upload-artifact@v4
         with:
         with:
           name: dist-linux-amd64
           name: dist-linux-amd64
@@ -390,9 +382,7 @@ jobs:
     environment: release
     environment: release
     runs-on: linux-arm64
     runs-on: linux-arm64
     env:
     env:
-      OLLAMA_SKIP_MANIFEST_CREATE: '1'
       BUILD_ARCH: arm64
       BUILD_ARCH: arm64
-      PUSH: '1'
     steps:
     steps:
       - uses: actions/checkout@v4
       - uses: actions/checkout@v4
         with:
         with:
@@ -421,14 +411,8 @@ jobs:
           sudo usermod -aG docker $USER
           sudo usermod -aG docker $USER
           sudo apt-get install acl
           sudo apt-get install acl
           sudo setfacl --modify user:$USER:rw /var/run/docker.sock
           sudo setfacl --modify user:$USER:rw /var/run/docker.sock
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
       - run: |
       - run: |
           ./scripts/build_linux.sh
           ./scripts/build_linux.sh
-          ./scripts/build_docker.sh
       - uses: actions/upload-artifact@v4
       - uses: actions/upload-artifact@v4
         with:
         with:
           name: dist-linux-arm64
           name: dist-linux-arm64
@@ -436,6 +420,181 @@ jobs:
             dist/*linux*
             dist/*linux*
             !dist/*-cov
             !dist/*-cov
 
 
+  # Container image build
+  build-linux:
+    environment: release
+    strategy:
+      matrix:
+        runner:
+          - linux
+          - linux-arm64
+    runs-on: ${{ matrix.runner }}
+    env:
+      FINAL_IMAGE_REPO: ollama/ollama
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: 'Install Docker'
+        if: ${{ startsWith(matrix.runner, 'linux-arm64') }}
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y ca-certificates curl
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+          echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt-get update
+          sudo apt-get install -y docker-ce docker-ce-cli containerd.io
+          sudo usermod -aG docker $USER
+          sudo apt-get install acl
+          sudo setfacl --modify user:$USER:rw /var/run/docker.sock
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FINAL_IMAGE_REPO }}
+          flavor: |
+            latest=false
+          tags: |
+            type=ref,event=tag
+            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+            type=semver,pattern={{version}}
+      - name: Set Version
+        shell: bash
+        run: |
+          machine=$(uname -m)
+          case ${machine} in
+            x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
+            aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
+          esac >>$GITHUB_ENV
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      - name: Build and push by digest
+        id: build
+        uses: docker/build-push-action@v6
+        with:
+          context: "."
+          platforms: linux/${{ env.ARCH }}
+          build-args: |
+            GOFLAGS
+          outputs: type=image,name=${{ env.FINAL_IMAGE_REPO }},push-by-digest=true,name-canonical=true,push=true
+      - name: Export digest
+        run: |
+          mkdir -p /tmp/digests
+          digest="${{ steps.build.outputs.digest }}"
+          touch "/tmp/digests/${digest#sha256:}"
+      - name: Upload digest
+        uses: actions/upload-artifact@v4
+        with:
+          name: digests-${{ env.PLATFORM_PAIR }}
+          path: /tmp/digests/*
+          if-no-files-found: error
+          retention-days: 1
+  merge:
+    environment: release
+    runs-on: linux
+    needs:
+      - build-linux
+    env:
+      FINAL_IMAGE_REPO: ollama/ollama
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Download digests
+        uses: actions/download-artifact@v4
+        with:
+          path: /tmp/digests
+          pattern: digests-*
+          merge-multiple: true
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FINAL_IMAGE_REPO }}
+          flavor: |
+            latest=false
+          tags: |
+            type=ref,event=tag
+            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+            type=semver,pattern={{version}}
+      - name: Set Version
+        shell: bash
+        run: |
+          machine=$(uname -m)
+          case ${machine} in
+            x86_64) echo ARCH=amd64; echo PLATFORM_PAIR=linux-amd64 ;;
+            aarch64) echo ARCH=arm64; echo PLATFORM_PAIR=linux-arm64 ;;
+          esac >>$GITHUB_ENV
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      - name: Create manifest list and push
+        working-directory: /tmp/digests
+        run: |
+          docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
+            $(printf '${{ env.FINAL_IMAGE_REPO }}@sha256:%s ' *)
+      - name: Inspect image
+        run: |
+          docker buildx imagetools inspect ${{ env.FINAL_IMAGE_REPO }}:${{ steps.meta.outputs.version }}          
+  build-linux-rocm:
+    environment: release
+    runs-on: linux
+    env:
+      FINAL_IMAGE_REPO: ollama/ollama
+      ARCH: amd64
+      PLATFORM_PAIR: linux-amd64
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FINAL_IMAGE_REPO }}
+          flavor: |
+            latest=false
+          tags: |
+            type=ref,event=tag
+            type=ref,enable=true,priority=600,prefix=0.0.0-pr,suffix=,event=pr
+            type=semver,pattern={{version}}
+      - name: Set Version
+        shell: bash
+        run: |
+          echo GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=${{ env.DOCKER_METADATA_OUTPUT_VERSION }}\" \"-X=github.com/ollama/ollama/server.mode=release\"'" >>$GITHUB_ENV
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      - name: Build and push by digest
+        id: build
+        uses: docker/build-push-action@v6
+        with:
+          context: "."
+          target: runtime-rocm
+          build-args: |
+            GOFLAGS
+          tags: ${{ env.FINAL_IMAGE_REPO }}:${{ env.DOCKER_METADATA_OUTPUT_VERSION}}-rocm,${{ env.FINAL_IMAGE_REPO }}:rocm
+          push: true
+
   # Aggregate all the assets and ship a release
   # Aggregate all the assets and ship a release
   release:
   release:
     needs:
     needs:
@@ -448,8 +607,6 @@ jobs:
     permissions:
     permissions:
       contents: write
       contents: write
     env:
     env:
-      OLLAMA_SKIP_IMAGE_BUILD: '1'
-      PUSH: '1'
       GH_TOKEN: ${{ github.token }}
       GH_TOKEN: ${{ github.token }}
     steps:
     steps:
       - uses: actions/checkout@v4
       - uses: actions/checkout@v4
@@ -458,12 +615,6 @@ jobs:
         run: |
         run: |
           echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
           echo "VERSION=${GITHUB_REF_NAME#v}" >> $GITHUB_ENV
           echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
           echo "RELEASE_VERSION=$(echo ${GITHUB_REF_NAME} | cut -f1 -d-)" >> $GITHUB_ENV
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ vars.DOCKER_USER }}
-          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
-      - run: ./scripts/build_docker.sh
       - name: Retrieve built artifact
       - name: Retrieve built artifact
         uses: actions/download-artifact@v4
         uses: actions/download-artifact@v4
         with:
         with:

+ 1 - 42
.github/workflows/test.yaml

@@ -81,12 +81,6 @@ jobs:
         if: ${{ ! startsWith(matrix.os, 'windows-') }}
         if: ${{ ! startsWith(matrix.os, 'windows-') }}
         name: 'Unix Go Generate'
         name: 'Unix Go Generate'
       - run: go build .
       - run: go build .
-      - uses: actions/upload-artifact@v4
-        with:
-          name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-          path: |
-            llm/build/**/bin/*
-            llm/build/**/*.a
   generate-cuda:
   generate-cuda:
     needs: [changes]
     needs: [changes]
     if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
     if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -114,12 +108,6 @@ jobs:
           go generate -x ./...
           go generate -x ./...
         env:
         env:
           OLLAMA_SKIP_CPU_GENERATE: '1'
           OLLAMA_SKIP_CPU_GENERATE: '1'
-      - uses: actions/upload-artifact@v4
-        with:
-          name: cuda-${{ matrix.cuda-version }}-libraries
-          path: |
-            llm/build/**/bin/*
-            dist/windows-amd64/**
   generate-rocm:
   generate-rocm:
     needs: [changes]
     needs: [changes]
     if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
     if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -147,12 +135,6 @@ jobs:
           go generate -x ./...
           go generate -x ./...
         env:
         env:
           OLLAMA_SKIP_CPU_GENERATE: '1'
           OLLAMA_SKIP_CPU_GENERATE: '1'
-      - uses: actions/upload-artifact@v4
-        with:
-          name: rocm-${{ matrix.rocm-version }}-libraries
-          path: |
-            llm/build/**/bin/*
-            dist/windows-amd64/**
 
 
   # ROCm generation step
   # ROCm generation step
   generate-windows-rocm:
   generate-windows-rocm:
@@ -189,7 +171,6 @@ jobs:
         name: go generate
         name: go generate
         env:
         env:
           OLLAMA_SKIP_CPU_GENERATE: '1'
           OLLAMA_SKIP_CPU_GENERATE: '1'
-      # TODO - do we need any artifacts?
 
 
   # CUDA generation step
   # CUDA generation step
   generate-windows-cuda:
   generate-windows-cuda:
@@ -231,7 +212,6 @@ jobs:
           go generate -x ./...
           go generate -x ./...
         env:
         env:
           OLLAMA_SKIP_CPU_GENERATE: '1'
           OLLAMA_SKIP_CPU_GENERATE: '1'
-      # TODO - do we need any artifacts?
 
 
   lint:
   lint:
     strategy:
     strategy:
@@ -263,14 +243,6 @@ jobs:
             arm64) echo ARCH=arm64 ;;
             arm64) echo ARCH=arm64 ;;
           esac >>$GITHUB_ENV
           esac >>$GITHUB_ENV
         shell: bash
         shell: bash
-      - run: |
-          mkdir -p llm/build/linux/$ARCH/stub/bin
-          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-      - run: |
-          mkdir -p llm/build/darwin/$ARCH/stub/bin
-          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'macos-') }}
       - uses: golangci/golangci-lint-action@v6
       - uses: golangci/golangci-lint-action@v6
         with:
         with:
           args: --timeout 8m0s -v
           args: --timeout 8m0s -v
@@ -301,23 +273,10 @@ jobs:
           cache: true
           cache: true
       - run: |
       - run: |
           case ${{ matrix.arch }} in
           case ${{ matrix.arch }} in
-            amd64) echo ARCH=x86_64 ;;
+            amd64) echo ARCH=amd64 ;;
             arm64) echo ARCH=arm64 ;;
             arm64) echo ARCH=arm64 ;;
           esac >>$GITHUB_ENV
           esac >>$GITHUB_ENV
         shell: bash
         shell: bash
-      - run: |
-          mkdir -p llm/build/linux/$ARCH/stub/bin
-          touch llm/build/linux/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'ubuntu-') }}
-      - run: |
-          mkdir -p llm/build/darwin/$ARCH/stub/bin
-          touch llm/build/darwin/$ARCH/stub/bin/ollama_llama_server
-        if: ${{ startsWith(matrix.os, 'macos-') }}
-        shell: bash
       - run: go generate ./...
       - run: go generate ./...
       - run: go build
       - run: go build
       - run: go test -v ./...
       - run: go test -v ./...
-      - uses: actions/upload-artifact@v4
-        with:
-          name: ${{ matrix.os }}-binaries
-          path: ollama

+ 3 - 0
.gitignore

@@ -12,4 +12,7 @@ ggml-metal.metal
 test_data
 test_data
 *.crt
 *.crt
 llm/build
 llm/build
+build/*/*/*
+!build/**/placeholder
+llama/build
 __debug_bin*
 __debug_bin*

+ 70 - 31
Dockerfile

@@ -47,7 +47,7 @@ RUN --mount=type=cache,target=/root/.ccache \
     OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
     OLLAMA_CUSTOM_CUDA_DEFS="-DGGML_CUDA_USE_GRAPHS=on" \
     bash gen_linux.sh
     bash gen_linux.sh
 
 
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-server-arm64
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_11-devel-rockylinux8 AS cuda-11-build-runner-arm64
 ARG CMAKE_VERSION
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -63,7 +63,7 @@ RUN OLLAMA_SKIP_STATIC_GENERATE=1 \
     CUDA_VARIANT="_v11" \
     CUDA_VARIANT="_v11" \
     bash gen_linux.sh
     bash gen_linux.sh
 
 
-FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-server-arm64
+FROM --platform=linux/arm64 nvidia/cuda:$CUDA_VERSION_12-devel-rockylinux8 AS cuda-12-build-runner-arm64
 ARG CMAKE_VERSION
 ARG CMAKE_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
 RUN CMAKE_VERSION=${CMAKE_VERSION} sh /rh_linux_deps.sh
@@ -143,64 +143,103 @@ RUN --mount=type=cache,target=/root/.ccache \
     OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
     OLLAMA_SKIP_STATIC_GENERATE=1 OLLAMA_CPU_TARGET="cpu" bash gen_linux.sh
 
 
 
 
-# Intermediate stage used for ./scripts/build_linux.sh
+# Intermediate stages used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 ENV CGO_ENABLED=1
 ENV CGO_ENABLED=1
 WORKDIR /go/src/github.com/ollama/ollama
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 COPY . .
-COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG GOFLAGS
 ARG CGO_CFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
 RUN --mount=type=cache,target=/root/.ccache \
     go build -trimpath -o dist/linux-amd64/bin/ollama .
     go build -trimpath -o dist/linux-amd64/bin/ollama .
+RUN cd dist/linux-$GOARCH && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+RUN cd dist/linux-$GOARCH-rocm && \
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz
 
 
-# Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
 FROM --platform=linux/arm64 cpu-build-arm64 AS build-arm64
 ENV CGO_ENABLED=1
 ENV CGO_ENABLED=1
 ARG GOLANG_VERSION
 ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
 COPY . .
-COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-11-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
-COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=cuda-12-build-server-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/ llm/build/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/build/ build/
 ARG GOFLAGS
 ARG GOFLAGS
 ARG CGO_CFLAGS
 ARG CGO_CFLAGS
 RUN --mount=type=cache,target=/root/.ccache \
 RUN --mount=type=cache,target=/root/.ccache \
     go build -trimpath -o dist/linux-arm64/bin/ollama .
     go build -trimpath -o dist/linux-arm64/bin/ollama .
+RUN cd dist/linux-$GOARCH && \
+    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
 
 
-# Strip out ROCm dependencies to keep the primary image lean
-FROM --platform=linux/amd64 ubuntu:22.04 AS amd64-libs-without-rocm
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /scratch/
-RUN cd /scratch/ollama/ && rm -rf rocblas libamd* libdrm* libroc* libhip* libhsa*
+FROM --platform=linux/amd64 scratch AS dist-amd64
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
+FROM --platform=linux/arm64 scratch AS dist-arm64
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
+FROM dist-$TARGETARCH as dist
+
+
+# Optimized container images do not carry nested payloads
+FROM --platform=linux/amd64 static-build-amd64 AS container-build-amd64
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-amd64/bin/ollama .
+
+FROM --platform=linux/arm64 static-build-arm64 AS container-build-arm64
+WORKDIR /go/src/github.com/ollama/ollama
+COPY . .
+ARG GOFLAGS
+ARG CGO_CFLAGS
+RUN --mount=type=cache,target=/root/.ccache \
+    go build -trimpath -o dist/linux-arm64/bin/ollama .
 
 
-# Runtime stages
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
-COPY --from=amd64-libs-without-rocm /scratch/ /lib/
-RUN apt-get update && apt-get install -y ca-certificates && \
+RUN apt-get update && \
+    apt-get install -y ca-certificates && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
     apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cuda-11-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cuda-12-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 
 
 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
-RUN apt-get update && apt-get install -y ca-certificates && \
+RUN apt-get update && \
+    apt-get install -y ca-certificates && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
     apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=cpu-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-11-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=cuda-12-build-runner-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
 
 
-# Radeon images are much larger so we keep it distinct from the CPU/CUDA image
-FROM  rocm/dev-centos-7:${ROCM_VERSION}-complete AS runtime-rocm
-RUN update-pciids
-COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
-RUN ln -s /opt/rocm/lib /lib/ollama
+# ROCm libraries are larger, so we keep them distinct from the CPU/CUDA image
+FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-rocm
+# Frontload the rocm libraries which are large, and rarely change to increase chance of a common layer
+# across releases
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/lib/ /lib/
+RUN apt-get update && \
+    apt-get install -y ca-certificates && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=cpu-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 EXPOSE 11434
 EXPOSE 11434
 ENV OLLAMA_HOST=0.0.0.0
 ENV OLLAMA_HOST=0.0.0.0
 
 

+ 1 - 0
build/darwin/amd64/placeholder

@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command

+ 1 - 0
build/darwin/arm64/placeholder

@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command

+ 8 - 0
build/embed_darwin_amd64.go

@@ -0,0 +1,8 @@
+package build
+
+import "embed"
+
+// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
+
+//go:embed darwin/amd64/*
+var EmbedFS embed.FS

+ 8 - 0
build/embed_darwin_arm64.go

@@ -0,0 +1,8 @@
+package build
+
+import "embed"
+
+// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
+
+//go:embed darwin/arm64/*
+var EmbedFS embed.FS

+ 6 - 0
build/embed_linux.go

@@ -0,0 +1,6 @@
+package build
+
+import "embed"
+
+//go:embed linux/*
+var EmbedFS embed.FS

+ 8 - 0
build/embed_unused.go

@@ -0,0 +1,8 @@
+//go:build !linux && !darwin
+
+package build
+
+import "embed"
+
+// unused on windows
+var EmbedFS embed.FS

+ 1 - 0
build/linux/amd64/placeholder

@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command

+ 1 - 0
build/linux/arm64/placeholder

@@ -0,0 +1 @@
+This is here to make sure the build/ directory exists for the go:embed command

+ 0 - 48
envconfig/config.go

@@ -179,53 +179,6 @@ var (
 	HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
 	HsaOverrideGfxVersion = String("HSA_OVERRIDE_GFX_VERSION")
 )
 )
 
 
-func RunnersDir() (p string) {
-	if p := Var("OLLAMA_RUNNERS_DIR"); p != "" {
-		return p
-	}
-
-	if runtime.GOOS != "windows" {
-		return
-	}
-
-	defer func() {
-		if p == "" {
-			slog.Error("unable to locate llm runner directory. Set OLLAMA_RUNNERS_DIR to the location of 'ollama/runners'")
-		}
-	}()
-
-	// On Windows we do not carry the payloads inside the main executable
-	exe, err := os.Executable()
-	if err != nil {
-		return
-	}
-
-	cwd, err := os.Getwd()
-	if err != nil {
-		return
-	}
-
-	var paths []string
-	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), LibRelativeToExe()), cwd} {
-		paths = append(paths,
-			root,
-			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
-			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
-		)
-	}
-
-	// Try a few variations to improve developer experience when building from source in the local tree
-	for _, path := range paths {
-		candidate := filepath.Join(path, "lib", "ollama", "runners")
-		if _, err := os.Stat(candidate); err == nil {
-			p = candidate
-			break
-		}
-	}
-
-	return p
-}
-
 func Uint(key string, defaultValue uint) func() uint {
 func Uint(key string, defaultValue uint) func() uint {
 	return func() uint {
 	return func() uint {
 		if s := Var(key); s != "" {
 		if s := Var(key); s != "" {
@@ -290,7 +243,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
-		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
 
 

+ 0 - 148
gpu/assets.go

@@ -1,148 +0,0 @@
-package gpu
-
-import (
-	"errors"
-	"fmt"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"strconv"
-	"strings"
-	"sync"
-	"syscall"
-	"time"
-
-	"github.com/ollama/ollama/envconfig"
-)
-
-var (
-	lock        sync.Mutex
-	payloadsDir = ""
-)
-
-func PayloadsDir() (string, error) {
-	lock.Lock()
-	defer lock.Unlock()
-	var err error
-	if payloadsDir == "" {
-		runnersDir := envconfig.RunnersDir()
-
-		if runnersDir != "" {
-			payloadsDir = runnersDir
-			return payloadsDir, nil
-		}
-
-		// The remainder only applies on non-windows where we still carry payloads in the main executable
-		cleanupTmpDirs()
-		tmpDir := envconfig.TmpDir()
-		if tmpDir == "" {
-			tmpDir, err = os.MkdirTemp("", "ollama")
-			if err != nil {
-				return "", fmt.Errorf("failed to generate tmp dir: %w", err)
-			}
-		} else {
-			err = os.MkdirAll(tmpDir, 0o755)
-			if err != nil {
-				return "", fmt.Errorf("failed to generate tmp dir %s: %w", tmpDir, err)
-			}
-		}
-
-		// Track our pid so we can clean up orphaned tmpdirs
-		n := filepath.Join(tmpDir, "ollama.pid")
-		if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
-			return "", fmt.Errorf("failed to write pid file %s: %w", n, err)
-		}
-
-		// We create a distinct subdirectory for payloads within the tmpdir
-		// This will typically look like /tmp/ollama3208993108/runners on linux
-		payloadsDir = filepath.Join(tmpDir, "runners")
-	}
-	return payloadsDir, nil
-}
-
-// Best effort to clean up prior tmpdirs
-func cleanupTmpDirs() {
-	matches, err := filepath.Glob(filepath.Join(os.TempDir(), "ollama*", "ollama.pid"))
-	if err != nil {
-		return
-	}
-
-	for _, match := range matches {
-		raw, err := os.ReadFile(match)
-		if errors.Is(err, os.ErrNotExist) {
-			slog.Debug("not a ollama runtime directory, skipping", "path", match)
-			continue
-		} else if err != nil {
-			slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		pid, err := strconv.Atoi(string(raw))
-		if err != nil {
-			slog.Warn("invalid pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		p, err := os.FindProcess(pid)
-		if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
-			slog.Warn("process still running, skipping", "pid", pid, "path", match)
-			continue
-		}
-
-		if err := os.Remove(match); err != nil {
-			slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
-		}
-
-		runners := filepath.Join(filepath.Dir(match), "runners")
-		if err := os.RemoveAll(runners); err != nil {
-			slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
-		}
-
-		if err := os.Remove(filepath.Dir(match)); err != nil {
-			slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
-		}
-	}
-}
-
-func Cleanup() {
-	lock.Lock()
-	defer lock.Unlock()
-	runnersDir := envconfig.RunnersDir()
-	if payloadsDir != "" && runnersDir == "" && runtime.GOOS != "windows" {
-		// We want to fully clean up the tmpdir parent of the payloads dir
-		tmpDir := filepath.Clean(filepath.Join(payloadsDir, ".."))
-		slog.Debug("cleaning up", "dir", tmpDir)
-		err := os.RemoveAll(tmpDir)
-		if err != nil {
-			// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
-			time.Sleep(1000 * time.Millisecond)
-			err = os.RemoveAll(tmpDir)
-			if err != nil {
-				slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
-			}
-		}
-	}
-}
-
-func UpdatePath(dir string) {
-	if runtime.GOOS == "windows" {
-		tmpDir := filepath.Dir(dir)
-		pathComponents := strings.Split(os.Getenv("PATH"), ";")
-		i := 0
-		for _, comp := range pathComponents {
-			if strings.EqualFold(comp, dir) {
-				return
-			}
-			// Remove any other prior paths to our temp dir
-			if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
-				pathComponents[i] = comp
-				i++
-			}
-		}
-		newPath := strings.Join(append([]string{dir}, pathComponents...), ";")
-		slog.Info("updating", "PATH", newPath)
-		os.Setenv("PATH", newPath)
-	}
-	// linux and darwin rely on rpath
-}

+ 3 - 4
gpu/gpu.go

@@ -93,10 +93,9 @@ func initCudaHandles() *cudaHandles {
 		localAppData := os.Getenv("LOCALAPPDATA")
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
 	}
 	}
-	tmpDir, _ := PayloadsDir()
-	if tmpDir != "" {
-		// TODO - add "payloads" for subprocess
-		cudartMgmtPatterns = []string{filepath.Join(tmpDir, "cuda*", CudartMgmtName)}
+	libDir := LibraryDir()
+	if libDir != "" {
+		cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
 	}
 	}
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
 
 

+ 25 - 5
llm/generate/gen_common.sh

@@ -31,6 +31,7 @@ init_vars() {
         NO_WHOLE_ARCHIVE=""
         NO_WHOLE_ARCHIVE=""
         GCC_ARCH="-arch ${ARCH}"
         GCC_ARCH="-arch ${ARCH}"
         DIST_BASE=../../dist/darwin-${GOARCH}/
         DIST_BASE=../../dist/darwin-${GOARCH}/
+        PAYLOAD_BASE=../../build/darwin/${GOARCH}
         ;;
         ;;
     "Linux")
     "Linux")
         LIB_EXT="so"
         LIB_EXT="so"
@@ -40,6 +41,7 @@ init_vars() {
         # Cross compiling not supported on linux - Use docker
         # Cross compiling not supported on linux - Use docker
         GCC_ARCH=""
         GCC_ARCH=""
         DIST_BASE=../../dist/linux-${GOARCH}/
         DIST_BASE=../../dist/linux-${GOARCH}/
+        PAYLOAD_BASE=../../build/linux/${GOARCH}
         ;;
         ;;
     *)
     *)
         ;;
         ;;
@@ -47,7 +49,8 @@ init_vars() {
     if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
     if [ -z "${CMAKE_CUDA_ARCHITECTURES}" ] ; then
         CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
         CMAKE_CUDA_ARCHITECTURES="50;52;61;70;75;80"
     fi
     fi
-    GZIP=$(which pigz 2>/dev/null || echo "gzip")
+    GZIP=$(command -v pigz 2>/dev/null || echo "gzip")
+    RUNNER_BASE="${DIST_BASE}/lib/ollama/runners"
 }
 }
 
 
 git_module_setup() {
 git_module_setup() {
@@ -91,17 +94,34 @@ build() {
     rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
     rm -f ${BUILD_DIR}/bin/ggml-common.h ${BUILD_DIR}/bin/ggml-metal.metal
 }
 }
 
 
+dist() {
+    [ -z "${RUNNER}" ] && exit 1
+    mkdir -p ${RUNNER_BASE}/${RUNNER}/
+    for f in ${BUILD_DIR}/bin/* ; do
+        cp ${f} ${RUNNER_BASE}/${RUNNER}/
+    done
+    # check for lib directory
+    if [ -d ${BUILD_DIR}/lib ]; then
+        for f in ${BUILD_DIR}/lib/* ; do
+            cp ${f} ${RUNNER_BASE}/${RUNNER}/
+        done
+    fi
+}
+
+# Compress from the build $BUILD_DIR into the $PAYLOAD_BASE/$RUNNER dir
 compress() {
 compress() {
-    echo "Compressing payloads to reduce overall binary size..."
-    rm -rf ${BUILD_DIR}/bin/*.gz
+    [ -z "${RUNNER}" ] && exit 1
+    echo "Compressing payloads with ${GZIP} to reduce overall binary size..."
+    rm -rf "${PAYLOAD_BASE}/${RUNNER}/"
+    mkdir -p "${PAYLOAD_BASE}/${RUNNER}/"
     for f in ${BUILD_DIR}/bin/* ; do
     for f in ${BUILD_DIR}/bin/* ; do
-        ${GZIP} -n --best -f ${f} &
+        ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
         compress_pids+=" $!"
         compress_pids+=" $!"
     done
     done
     # check for lib directory
     # check for lib directory
     if [ -d ${BUILD_DIR}/lib ]; then
     if [ -d ${BUILD_DIR}/lib ]; then
         for f in ${BUILD_DIR}/lib/* ; do
         for f in ${BUILD_DIR}/lib/* ; do
-            ${GZIP} -n --best -f ${f} &
+            ${GZIP} -c --best ${f} > "${PAYLOAD_BASE}/${RUNNER}/$(basename ${f}).gz" &
             compress_pids+=" $!"
             compress_pids+=" $!"
         done
         done
     fi
     fi

+ 8 - 4
llm/generate/gen_darwin.sh

@@ -39,7 +39,8 @@ case "${GOARCH}" in
         #
         #
         init_vars
         init_vars
         CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
         CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="../build/darwin/${ARCH}/cpu"
+        RUNNER=cpu
+        BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
         echo "Building LCD CPU"
         echo "Building LCD CPU"
         build
         build
         sign ${BUILD_DIR}/bin/ollama_llama_server
         sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -51,7 +52,8 @@ case "${GOARCH}" in
         #
         #
         init_vars
         init_vars
         CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
         CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=off -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-        BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
+        RUNNER=cpu_avx
+        BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
         echo "Building AVX CPU"
         echo "Building AVX CPU"
         build
         build
         sign ${BUILD_DIR}/bin/ollama_llama_server
         sign ${BUILD_DIR}/bin/ollama_llama_server
@@ -63,7 +65,8 @@ case "${GOARCH}" in
         #
         #
         init_vars
         init_vars
         CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
         CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_ACCELERATE=on -DGGML_BLAS=off -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-        BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
+        RUNNER=cpu_avx2
+        BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
         echo "Building AVX2 CPU"
         echo "Building AVX2 CPU"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
         build
         build
@@ -84,7 +87,8 @@ case "${GOARCH}" in
     if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
     if [ -z "$OLLAMA_SKIP_METAL_GENERATE" ]; then
         init_vars
         init_vars
         CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
         CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
-        BUILD_DIR="../build/darwin/${ARCH}/metal"
+        RUNNER="metal"
+        BUILD_DIR="../build/darwin/${ARCH}/${RUNNER}"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
         EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
         build
         build
         sign ${BUILD_DIR}/bin/ollama_llama_server
         sign ${BUILD_DIR}/bin/ollama_llama_server

+ 26 - 9
llm/generate/gen_linux.sh

@@ -79,10 +79,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         init_vars
         init_vars
         echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
         echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
         CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
         CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        BUILD_DIR="../build/linux/${ARCH}/cpu"
+        RUNNER="cpu"
+        BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
         echo "Building custom CPU"
         echo "Building custom CPU"
         build
         build
         install
         install
+        dist
         compress
         compress
     else
     else
         # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
         # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
@@ -102,10 +104,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             #
             #
             init_vars
             init_vars
             CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
             CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-            BUILD_DIR="../build/linux/${ARCH}/cpu"
+            RUNNER=cpu
+            BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
             echo "Building LCD CPU"
             echo "Building LCD CPU"
             build
             build
             install
             install
+            dist
             compress
             compress
         fi
         fi
 
 
@@ -120,10 +124,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                 #
                 #
                 init_vars
                 init_vars
                 CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
                 CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
-                BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
+                RUNNER=cpu_avx
+                BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
                 echo "Building AVX CPU"
                 echo "Building AVX CPU"
                 build
                 build
                 install
                 install
+                dist
                 compress
                 compress
             fi
             fi
 
 
@@ -134,10 +140,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                 #
                 #
                 init_vars
                 init_vars
                 CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
                 CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on ${CMAKE_DEFS}"
-                BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
+                RUNNER=cpu_avx2
+                BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
                 echo "Building AVX2 CPU"
                 echo "Building AVX2 CPU"
                 build
                 build
                 install
                 install
+                dist
                 compress
                 compress
             fi
             fi
         fi
         fi
@@ -187,11 +195,13 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then
     fi
     fi
     export CUDAFLAGS="-t8"
     export CUDAFLAGS="-t8"
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS} ${CMAKE_CUDA_DEFS} -DGGML_STATIC=off"
-    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
+    RUNNER=cuda${CUDA_VARIANT}
+    BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
     export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
     export LLAMA_SERVER_LDFLAGS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
     CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
     CUDA_DIST_DIR="${CUDA_DIST_DIR:-${DIST_BASE}/lib/ollama}"
     build
     build
     install
     install
+    dist
     echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
     echo "Installing CUDA dependencies in ${CUDA_DIST_DIR}"
     mkdir -p "${CUDA_DIST_DIR}"
     mkdir -p "${CUDA_DIST_DIR}"
     for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
     for lib in ${CUDA_LIB_DIR}/libcudart.so* ${CUDA_LIB_DIR}/libcublas.so* ${CUDA_LIB_DIR}/libcublasLt.so* ; do
@@ -212,7 +222,8 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
     source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
     source ${ONEAPI_ROOT}/setvars.sh --force # set up environment variables for oneAPI
     CC=icx
     CC=icx
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_SYCL_F16=OFF"
-    BUILD_DIR="../build/linux/${ARCH}/oneapi"
+    RUNNER=oneapi
+    BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
     ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
     ONEAPI_DIST_DIR="${DIST_BASE}/lib/ollama"
     export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
     export LLAMA_SERVER_LDFLAGS="-fsycl -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
     DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
     DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
@@ -231,6 +242,7 @@ if [ -z "${OLLAMA_SKIP_ONEAPI_GENERATE}" -a -d "${ONEAPI_ROOT}" ]; then
     cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
     cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${ONEAPI_DIST_DIR}"
     cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
     cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${ONEAPI_DIST_DIR}"
     install
     install
+    dist
     compress
     compress
 fi
 fi
 
 
@@ -259,7 +271,8 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
         CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
         CMAKE_DEFS="${CMAKE_DEFS} ${OLLAMA_CUSTOM_ROCM_DEFS}"
         echo "Building custom ROCM GPU"
         echo "Building custom ROCM GPU"
     fi
     fi
-    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
+    RUNNER=rocm${ROCM_VARIANT}
+    BUILD_DIR="../build/linux/${ARCH}/${RUNNER}"
     # ROCm dependencies are too large to fit into a unified bundle
     # ROCm dependencies are too large to fit into a unified bundle
     ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
     ROCM_DIST_DIR="${DIST_BASE}/../linux-${GOARCH}-rocm/lib/ollama"
     # TODO figure out how to disable runpath (rpath)
     # TODO figure out how to disable runpath (rpath)
@@ -269,13 +282,17 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
 
 
     # copy the ROCM dependencies
     # copy the ROCM dependencies
     mkdir -p "${ROCM_DIST_DIR}"
     mkdir -p "${ROCM_DIST_DIR}"
-    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo ); do
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -v "${ARCH}/rocm${ROCM_VARIANT}" | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf ); do
         cp -a "${dep}"* "${ROCM_DIST_DIR}"
         cp -a "${dep}"* "${ROCM_DIST_DIR}"
+        if [ $(readlink -f "${dep}") != "${dep}" ] ; then
+            cp $(readlink -f "${dep}") "${ROCM_DIST_DIR}"
+        fi
     done
     done
     install
     install
+    dist
     compress
     compress
 fi
 fi
 
 
 cleanup
 cleanup
 wait_for_compress
 wait_for_compress
-echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"
+echo "go generate completed.  LLM runners: $(cd ${PAYLOAD_BASE}; echo *)"

+ 0 - 4
llm/llm_darwin_arm64.go → llm/llm_darwin.go

@@ -1,11 +1,7 @@
 package llm
 package llm
 
 
 import (
 import (
-	"embed"
 	"syscall"
 	"syscall"
 )
 )
 
 
-//go:embed build/darwin/arm64/*/bin/*
-var libEmbed embed.FS
-
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

+ 0 - 11
llm/llm_darwin_amd64.go

@@ -1,11 +0,0 @@
-package llm
-
-import (
-	"embed"
-	"syscall"
-)
-
-//go:embed build/darwin/x86_64/*/bin/*
-var libEmbed embed.FS
-
-var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

+ 0 - 4
llm/llm_linux.go

@@ -1,11 +1,7 @@
 package llm
 package llm
 
 
 import (
 import (
-	"embed"
 	"syscall"
 	"syscall"
 )
 )
 
 
-//go:embed build/linux/*/*/bin/*
-var libEmbed embed.FS
-
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

+ 0 - 4
llm/llm_windows.go

@@ -1,13 +1,9 @@
 package llm
 package llm
 
 
 import (
 import (
-	"embed"
 	"syscall"
 	"syscall"
 )
 )
 
 
-// unused on windows
-var libEmbed embed.FS
-
 const CREATE_DEFAULT_ERROR_MODE = 0x04000000
 const CREATE_DEFAULT_ERROR_MODE = 0x04000000
 
 
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{
 var LlamaServerSysProcAttr = &syscall.SysProcAttr{

+ 0 - 233
llm/payload.go

@@ -1,233 +0,0 @@
-package llm
-
-import (
-	"compress/gzip"
-	"errors"
-	"fmt"
-	"io"
-	"io/fs"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"slices"
-	"strings"
-
-	"golang.org/x/sync/errgroup"
-
-	"github.com/ollama/ollama/gpu"
-)
-
-var errPayloadMissing = errors.New("expected payloads not included in this build of ollama")
-
-func Init() error {
-	payloadsDir, err := gpu.PayloadsDir()
-	if err != nil {
-		return err
-	}
-
-	if runtime.GOOS != "windows" {
-		slog.Info("extracting embedded files", "dir", payloadsDir)
-		binGlob := "build/*/*/*/bin/*"
-
-		// extract server libraries
-		err = extractFiles(payloadsDir, binGlob)
-		if err != nil {
-			return fmt.Errorf("extract binaries: %v", err)
-		}
-	}
-
-	var variants []string
-	for v := range getAvailableServers() {
-		variants = append(variants, v)
-	}
-	slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
-	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-
-	return nil
-}
-
-// binary names may contain an optional variant separated by '_'
-// For example, "ollama_rocm_v6" and "ollama_rocm_v5" or "ollama_cpu" and "ollama_cpu_avx2"
-// Any library without a variant is the lowest common denominator
-func getAvailableServers() map[string]string {
-	payloadsDir, err := gpu.PayloadsDir()
-	if err != nil {
-		slog.Error("payload lookup error", "error", err)
-		return nil
-	}
-
-	// glob payloadsDir for files that start with ollama_
-	pattern := filepath.Join(payloadsDir, "*", "ollama_*")
-
-	files, err := filepath.Glob(pattern)
-	if err != nil {
-		slog.Debug("could not glob", "pattern", pattern, "error", err)
-		return nil
-	}
-
-	servers := make(map[string]string)
-	for _, file := range files {
-		slog.Debug("availableServers : found", "file", file)
-		servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
-	}
-
-	return servers
-}
-
-// serversForGpu returns a list of compatible servers give the provided GPU
-// info, ordered by performance. assumes Init() has been called
-// TODO - switch to metadata based mapping
-func serversForGpu(info gpu.GpuInfo) []string {
-	// glob workDir for files that start with ollama_
-	availableServers := getAvailableServers()
-	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone.String() {
-		requested += "_" + info.Variant
-	}
-
-	servers := []string{}
-
-	// exact match first
-	for a := range availableServers {
-		if a == requested {
-			servers = []string{a}
-
-			if a == "metal" {
-				return servers
-			}
-
-			break
-		}
-	}
-
-	alt := []string{}
-
-	// Then for GPUs load alternates and sort the list for consistent load ordering
-	if info.Library != "cpu" {
-		for a := range availableServers {
-			if info.Library == strings.Split(a, "_")[0] && a != requested {
-				alt = append(alt, a)
-			}
-		}
-
-		slices.Sort(alt)
-		servers = append(servers, alt...)
-	}
-
-	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
-		// Load up the best CPU variant if not primary requested
-		if info.Library != "cpu" {
-			variant := gpu.GetCPUCapability()
-			// If no variant, then we fall back to default
-			// If we have a variant, try that if we find an exact match
-			// Attempting to run the wrong CPU instructions will panic the
-			// process
-			if variant != gpu.CPUCapabilityNone {
-				for cmp := range availableServers {
-					if cmp == "cpu_"+variant.String() {
-						servers = append(servers, cmp)
-						break
-					}
-				}
-			} else {
-				servers = append(servers, "cpu")
-			}
-		}
-
-		if len(servers) == 0 {
-			servers = []string{"cpu"}
-		}
-	}
-
-	return servers
-}
-
-// Return the optimal server for this CPU architecture
-func serverForCpu() string {
-	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return "metal"
-	}
-	variant := gpu.GetCPUCapability()
-	availableServers := getAvailableServers()
-	if variant != gpu.CPUCapabilityNone {
-		for cmp := range availableServers {
-			if cmp == "cpu_"+variant.String() {
-				return cmp
-			}
-		}
-	}
-	return "cpu"
-}
-
-// extract extracts the embedded files to the target directory
-func extractFiles(targetDir string, glob string) error {
-	files, err := fs.Glob(libEmbed, glob)
-	if err != nil || len(files) == 0 {
-		return errPayloadMissing
-	}
-
-	if err := os.MkdirAll(targetDir, 0o755); err != nil {
-		return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
-	}
-
-	g := new(errgroup.Group)
-
-	// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
-	for _, file := range files {
-		filename := file
-
-		variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
-
-		slog.Debug("extracting", "variant", variant, "file", filename)
-
-		g.Go(func() error {
-			srcf, err := libEmbed.Open(filename)
-			if err != nil {
-				return err
-			}
-			defer srcf.Close()
-
-			src := io.Reader(srcf)
-			if strings.HasSuffix(filename, ".gz") {
-				src, err = gzip.NewReader(src)
-				if err != nil {
-					return fmt.Errorf("decompress payload %s: %v", filename, err)
-				}
-				filename = strings.TrimSuffix(filename, ".gz")
-			}
-
-			variantDir := filepath.Join(targetDir, variant)
-			if err := os.MkdirAll(variantDir, 0o755); err != nil {
-				return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
-			}
-
-			base := filepath.Base(filename)
-			destFilename := filepath.Join(variantDir, base)
-
-			_, err = os.Stat(destFilename)
-			switch {
-			case errors.Is(err, os.ErrNotExist):
-				destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-				if err != nil {
-					return fmt.Errorf("write payload %s: %v", filename, err)
-				}
-				defer destFile.Close()
-				if _, err := io.Copy(destFile, src); err != nil {
-					return fmt.Errorf("copy payload %s: %v", filename, err)
-				}
-			case err != nil:
-				return fmt.Errorf("stat payload %s: %v", filename, err)
-			}
-			return nil
-		})
-	}
-
-	err = g.Wait()
-	if err != nil {
-		// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
-		gpu.Cleanup()
-		return err
-	}
-	return nil
-}

+ 13 - 16
llm/server.go

@@ -24,9 +24,11 @@ import (
 	"golang.org/x/sync/semaphore"
 	"golang.org/x/sync/semaphore"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/build"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/runners"
 )
 )
 
 
 type LlamaServer interface {
 type LlamaServer interface {
@@ -106,7 +108,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		gpus = gpu.GetCPUInfo()
 		gpus = gpu.GetCPUInfo()
 	}
 	}
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		cpuRunner = serverForCpu()
+		cpuRunner = runners.ServerForCpu()
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 	} else {
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
@@ -118,7 +120,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			opts.NumGPU = 0
 			opts.NumGPU = 0
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			// Don't bother loading into the GPU if no layers can fit
-			cpuRunner = serverForCpu()
+			cpuRunner = runners.ServerForCpu()
 			gpus = gpu.GetCPUInfo()
 			gpus = gpu.GetCPUInfo()
 		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 			opts.NumGPU = estimate.Layers
 			opts.NumGPU = estimate.Layers
@@ -145,25 +147,20 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}
 	}
 
 
-	availableServers := getAvailableServers()
+	rDir, err := runners.Refresh(build.EmbedFS)
+	if err != nil {
+		return nil, err
+	}
+
+	availableServers := runners.GetAvailableServers(rDir)
 	if len(availableServers) == 0 {
 	if len(availableServers) == 0 {
-		if runtime.GOOS != "windows" {
-			slog.Warn("llama server binary disappeared, reinitializing payloads")
-			err = Init()
-			if err != nil {
-				slog.Warn("failed to reinitialize payloads", "error", err)
-				return nil, err
-			}
-			availableServers = getAvailableServers()
-		} else {
-			return nil, finalErr
-		}
+		return nil, finalErr
 	}
 	}
 	var servers []string
 	var servers []string
 	if cpuRunner != "" {
 	if cpuRunner != "" {
 		servers = []string{cpuRunner}
 		servers = []string{cpuRunner}
 	} else {
 	} else {
-		servers = serversForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
+		servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
 	}
 	}
 	demandLib := envconfig.LLMLibrary()
 	demandLib := envconfig.LLMLibrary()
 	if demandLib != "" {
 	if demandLib != "" {
@@ -330,7 +327,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		_, err := os.Stat(server)
 		_, err := os.Stat(server)
 		if errors.Is(err, os.ErrNotExist) {
 		if errors.Is(err, os.ErrNotExist) {
 			slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
 			slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
-			err = Init()
+			_, err = runners.Refresh(build.EmbedFS)
 			if err != nil {
 			if err != nil {
 				slog.Warn("failed to reinitialize payloads", "error", err)
 				slog.Warn("failed to reinitialize payloads", "error", err)
 				return nil, err
 				return nil, err

+ 384 - 0
runners/common.go

@@ -0,0 +1,384 @@
+package runners
+
+import (
+	"compress/gzip"
+	"errors"
+	"fmt"
+	"io"
+	"io/fs"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"runtime"
+	"slices"
+	"strconv"
+	"strings"
+	"sync"
+	"syscall"
+
+	"golang.org/x/sync/errgroup"
+
+	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/gpu"
+)
+
const (
	// binGlob matches payload files laid out as $OS/$GOARCH/$RUNNER/$FILE
	// inside the embedded payload filesystem.
	binGlob = "*/*/*/*"
)

var (
	// lock serializes Refresh/Cleanup so concurrent callers cannot race on
	// extraction or removal of the runners directory.
	lock       sync.Mutex
	// runnersDir caches the resolved runners location after the first
	// successful Refresh; empty until then.
	runnersDir = ""
)
+
+// Return the location where runners are stored
+// If runners are payloads, this will either extract them
+// or refresh them if any have disappeared due to tmp cleaners
+func Refresh(payloadFS fs.FS) (string, error) {
+	lock.Lock()
+	defer lock.Unlock()
+	var err error
+
+	// Wire up extra logging on our first load
+	if runnersDir == "" {
+		defer func() {
+			var runners []string
+			for v := range GetAvailableServers(runnersDir) {
+				runners = append(runners, v)
+			}
+			slog.Info("Dynamic LLM libraries", "runners", runners)
+			slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
+		}()
+	}
+
+	if hasPayloads(payloadFS) {
+		if runnersDir == "" {
+			runnersDir, err = extractRunners(payloadFS)
+		} else {
+			err = refreshRunners(payloadFS, runnersDir)
+		}
+	} else if runnersDir == "" {
+		runnersDir, err = locateRunners()
+	}
+
+	return runnersDir, err
+}
+
+func Cleanup(payloadFS fs.FS) {
+	lock.Lock()
+	defer lock.Unlock()
+	if hasPayloads(payloadFS) && runnersDir != "" {
+		// We want to fully clean up the tmpdir parent of the payloads dir
+		tmpDir := filepath.Clean(filepath.Join(runnersDir, ".."))
+		slog.Debug("cleaning up", "dir", tmpDir)
+		err := os.RemoveAll(tmpDir)
+		if err != nil {
+			slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
+		}
+	}
+}
+
+func locateRunners() (string, error) {
+	exe, err := os.Executable()
+	if err != nil {
+		return "", err
+	}
+
+	cwd, err := os.Getwd()
+	if err != nil {
+		return "", err
+	}
+
+	var paths []string
+	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
+		paths = append(paths,
+			root,
+			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
+			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
+		)
+	}
+
+	// Try a few variations to improve developer experience when building from source in the local tree
+	for _, path := range paths {
+		candidate := filepath.Join(path, "lib", "ollama", "runners")
+		if _, err := os.Stat(candidate); err == nil {
+			return candidate, nil
+		}
+	}
+	return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
+}
+
+// Return true if we're carying nested payloads for the runners
+func hasPayloads(payloadFS fs.FS) bool {
+	files, err := fs.Glob(payloadFS, binGlob)
+	if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) {
+		return false
+	}
+	return true
+}
+
+func extractRunners(payloadFS fs.FS) (string, error) {
+	cleanupTmpDirs()
+	tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama")
+	if err != nil {
+		return "", fmt.Errorf("failed to generate tmp dir: %w", err)
+	}
+	// Track our pid so we can clean up orphaned tmpdirs
+	n := filepath.Join(tmpDir, "ollama.pid")
+	if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
+		slog.Warn("failed to write pid file", "file", n, "error", err)
+	}
+	// We create a distinct subdirectory for payloads within the tmpdir
+	// This will typically look like /tmp/ollama3208993108/runners on linux
+	rDir := filepath.Join(tmpDir, "runners")
+
+	slog.Info("extracting embedded files", "dir", rDir)
+	return rDir, refreshRunners(payloadFS, rDir)
+}
+
+func refreshRunners(payloadFS fs.FS, rDir string) error {
+	// extract or refresh server libraries
+	err := extractFiles(payloadFS, rDir, binGlob)
+	if err != nil {
+		return fmt.Errorf("extract binaries: %v", err)
+	}
+	return nil
+}
+
+// extract extracts the embedded files to the target directory
+func extractFiles(payloadFS fs.FS, targetDir string, glob string) error {
+	files, err := fs.Glob(payloadFS, glob)
+	if err != nil || len(files) == 0 {
+		// Should not happen
+		return fmt.Errorf("extractFiles called without payload present")
+	}
+
+	if err := os.MkdirAll(targetDir, 0o755); err != nil {
+		return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
+	}
+
+	g := new(errgroup.Group)
+
+	// $OS/$GOARCH/$RUNNER/$FILE
+	for _, file := range files {
+		filename := file
+
+		runner := filepath.Base(filepath.Dir(filename))
+
+		slog.Debug("extracting", "runner", runner, "payload", filename)
+
+		g.Go(func() error {
+			srcf, err := payloadFS.Open(filename)
+			if err != nil {
+				return err
+			}
+			defer srcf.Close()
+
+			src := io.Reader(srcf)
+			if strings.HasSuffix(filename, ".gz") {
+				src, err = gzip.NewReader(src)
+				if err != nil {
+					return fmt.Errorf("decompress payload %s: %v", filename, err)
+				}
+				filename = strings.TrimSuffix(filename, ".gz")
+			}
+
+			runnerDir := filepath.Join(targetDir, runner)
+			if err := os.MkdirAll(runnerDir, 0o755); err != nil {
+				return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err)
+			}
+
+			base := filepath.Base(filename)
+			destFilename := filepath.Join(runnerDir, base)
+
+			_, err = os.Stat(destFilename)
+			switch {
+			case errors.Is(err, os.ErrNotExist):
+				destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+				if err != nil {
+					return fmt.Errorf("write payload %s: %v", filename, err)
+				}
+				defer destFile.Close()
+				if _, err := io.Copy(destFile, src); err != nil {
+					return fmt.Errorf("copy payload %s: %v", filename, err)
+				}
+			case err != nil:
+				return fmt.Errorf("stat payload %s: %v", filename, err)
+			}
+			return nil
+		})
+	}
+
+	err = g.Wait()
+	if err != nil {
+		slog.Error("failed to extract files", "error", err)
+		// If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted
+		err := os.RemoveAll(targetDir)
+		if err != nil {
+			slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err)
+		}
+		return err
+	}
+	return nil
+}
+
+// Best effort to clean up prior tmpdirs
+func cleanupTmpDirs() {
+	tmpDir := envconfig.TmpDir()
+	if tmpDir == "" {
+		tmpDir = os.TempDir()
+	}
+	matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid"))
+	if err != nil {
+		return
+	}
+
+	for _, match := range matches {
+		raw, err := os.ReadFile(match)
+		if errors.Is(err, os.ErrNotExist) {
+			slog.Debug("not a ollama runtime directory, skipping", "path", match)
+			continue
+		} else if err != nil {
+			slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
+			continue
+		}
+
+		pid, err := strconv.Atoi(string(raw))
+		if err != nil {
+			slog.Warn("invalid pid, skipping", "path", match, "error", err)
+			continue
+		}
+
+		p, err := os.FindProcess(pid)
+		if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
+			slog.Warn("process still running, skipping", "pid", pid, "path", match)
+			continue
+		}
+
+		if err := os.Remove(match); err != nil {
+			slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
+		}
+
+		runners := filepath.Join(filepath.Dir(match), "runners")
+		if err := os.RemoveAll(runners); err != nil {
+			slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
+		}
+
+		if err := os.Remove(filepath.Dir(match)); err != nil {
+			slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
+		}
+	}
+}
+
+// directory names are the name of the runner and may contain an optional
+// variant prefixed with '_' as the separator. For example, "cuda_v11" and
+// "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
+// lowest common denominator
+func GetAvailableServers(payloadsDir string) map[string]string {
+	if payloadsDir == "" {
+		slog.Error("empty runner dir")
+		return nil
+	}
+
+	// glob payloadsDir for files that start with ollama_
+	pattern := filepath.Join(payloadsDir, "*", "ollama_*")
+
+	files, err := filepath.Glob(pattern)
+	if err != nil {
+		slog.Debug("could not glob", "pattern", pattern, "error", err)
+		return nil
+	}
+
+	servers := make(map[string]string)
+	for _, file := range files {
+		slog.Debug("availableServers : found", "file", file)
+		servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
+	}
+
+	return servers
+}
+
+// serversForGpu returns a list of compatible servers give the provided GPU
+// info, ordered by performance. assumes Init() has been called
+// TODO - switch to metadata based mapping
+func ServersForGpu(info gpu.GpuInfo) []string {
+	// glob workDir for files that start with ollama_
+	availableServers := GetAvailableServers(runnersDir)
+	requested := info.Library
+	if info.Variant != gpu.CPUCapabilityNone.String() {
+		requested += "_" + info.Variant
+	}
+
+	servers := []string{}
+
+	// exact match first
+	for a := range availableServers {
+		if a == requested {
+			servers = []string{a}
+
+			if a == "metal" {
+				return servers
+			}
+
+			break
+		}
+	}
+
+	alt := []string{}
+
+	// Then for GPUs load alternates and sort the list for consistent load ordering
+	if info.Library != "cpu" {
+		for a := range availableServers {
+			if info.Library == strings.Split(a, "_")[0] && a != requested {
+				alt = append(alt, a)
+			}
+		}
+
+		slices.Sort(alt)
+		servers = append(servers, alt...)
+	}
+
+	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
+		// Load up the best CPU variant if not primary requested
+		if info.Library != "cpu" {
+			variant := gpu.GetCPUCapability()
+			// If no variant, then we fall back to default
+			// If we have a variant, try that if we find an exact match
+			// Attempting to run the wrong CPU instructions will panic the
+			// process
+			if variant != gpu.CPUCapabilityNone {
+				for cmp := range availableServers {
+					if cmp == "cpu_"+variant.String() {
+						servers = append(servers, cmp)
+						break
+					}
+				}
+			} else {
+				servers = append(servers, "cpu")
+			}
+		}
+
+		if len(servers) == 0 {
+			servers = []string{"cpu"}
+		}
+	}
+
+	return servers
+}
+
+// Return the optimal server for this CPU architecture
+func ServerForCpu() string {
+	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
+		return "metal"
+	}
+	variant := gpu.GetCPUCapability()
+	availableServers := GetAvailableServers(runnersDir)
+	if variant != gpu.CPUCapabilityNone {
+		for cmp := range availableServers {
+			if cmp == "cpu_"+variant.String() {
+				return cmp
+			}
+		}
+	}
+	return "cpu"
+}

+ 50 - 0
runners/runners_test.go

@@ -0,0 +1,50 @@
+package runners
+
+import (
+	"log/slog"
+	"os"
+	"path"
+	"runtime"
+	"strings"
+	"testing"
+	"testing/fstest"
+)
+
+func TestRefreshRunners(t *testing.T) {
+	slog.SetLogLoggerLevel(slog.LevelDebug)
+
+	payloadFS := fstest.MapFS{
+		path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
+	}
+	tmpDir, err := os.MkdirTemp("", "testing")
+	if err != nil {
+		t.Fatalf("failed to make tmp dir %s", err)
+	}
+	t.Setenv("OLLAMA_TMPDIR", tmpDir)
+	rDir, err := Refresh(payloadFS)
+	if err != nil {
+		t.Fatalf("failed to extract to %s %s", tmpDir, err)
+	}
+	if !strings.Contains(rDir, tmpDir) {
+		t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
+	}
+
+	// spot check results
+	servers := GetAvailableServers(rDir)
+	if len(servers) < 1 {
+		t.Fatalf("expected at least 1 server")
+	}
+
+	// Refresh contents
+	rDir, err = extractRunners(payloadFS)
+	if err != nil {
+		t.Fatalf("failed to extract to %s %s", tmpDir, err)
+	}
+	if !strings.Contains(rDir, tmpDir) {
+		t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
+	}
+
+	cleanupTmpDirs()
+
+	Cleanup(payloadFS)
+}

+ 1 - 2
scripts/build_darwin.sh

@@ -2,8 +2,7 @@
 
 
 set -e
 set -e
 
 
-export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
-export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+. $(dirname $0)/env.sh
 
 
 mkdir -p dist
 mkdir -p dist
 
 

+ 21 - 63
scripts/build_docker.sh

@@ -2,76 +2,34 @@
 
 
 set -eu
 set -eu
 
 
-export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
-export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
-
-# We use 2 different image repositories to handle combining architecture images into multiarch manifest
-# (The ROCm image is x86 only and is not a multiarch manifest)
-# For developers, you can override the DOCKER_ORG to generate multiarch manifests
-#  DOCKER_ORG=jdoe PUSH=1 ./scripts/build_docker.sh
-DOCKER_ORG=${DOCKER_ORG:-"ollama"}
-RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
-FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
-
-BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
+. $(dirname $0)/env.sh
 
 
 # Set PUSH to a non-empty string to trigger push instead of load
 # Set PUSH to a non-empty string to trigger push instead of load
 PUSH=${PUSH:-""}
 PUSH=${PUSH:-""}
 
 
-# In CI mode, we break things down
-OLLAMA_SKIP_MANIFEST_CREATE=${OLLAMA_SKIP_MANIFEST_CREATE:-""}
-OLLAMA_SKIP_IMAGE_BUILD=${OLLAMA_SKIP_IMAGE_BUILD:-""}
-
 if [ -z "${PUSH}" ] ; then
 if [ -z "${PUSH}" ] ; then
+    echo "Building ${FINAL_IMAGE_REPO}:$VERSION locally.  set PUSH=1 to push"
     LOAD_OR_PUSH="--load"
     LOAD_OR_PUSH="--load"
 else
 else
-    echo "Will be pushing ${RELEASE_IMAGE_REPO}:$VERSION for ${BUILD_ARCH}"
+    echo "Will be pushing ${FINAL_IMAGE_REPO}:$VERSION"
     LOAD_OR_PUSH="--push"
     LOAD_OR_PUSH="--push"
 fi
 fi
 
 
-if [ -z "${OLLAMA_SKIP_IMAGE_BUILD}" ]; then
-    for TARGETARCH in ${BUILD_ARCH}; do
-        docker build \
-            ${LOAD_OR_PUSH} \
-            --platform=linux/${TARGETARCH} \
-            --build-arg=VERSION \
-            --build-arg=GOFLAGS \
-            -f Dockerfile \
-            -t ${RELEASE_IMAGE_REPO}:$VERSION-${TARGETARCH} \
-            .
-    done
-
-    if echo ${BUILD_ARCH} | grep "amd64" > /dev/null; then
-        docker build \
-            ${LOAD_OR_PUSH} \
-            --platform=linux/amd64 \
-            --build-arg=VERSION \
-            --build-arg=GOFLAGS \
-            --target runtime-rocm \
-            -f Dockerfile \
-            -t ${RELEASE_IMAGE_REPO}:$VERSION-rocm \
-            .
-    fi
-fi
-
-if [ -z "${OLLAMA_SKIP_MANIFEST_CREATE}" ]; then
-    if [ -n "${PUSH}" ]; then
-        docker manifest create ${FINAL_IMAGE_REPO}:$VERSION \
-            ${RELEASE_IMAGE_REPO}:$VERSION-amd64 \
-            ${RELEASE_IMAGE_REPO}:$VERSION-arm64
-        docker manifest push ${FINAL_IMAGE_REPO}:$VERSION
-
-        # For symmetry, tag/push the rocm image
-        if [ "${RELEASE_IMAGE_REPO}" != "${FINAL_IMAGE_REPO}" ]; then
-            echo "Tagging and pushing rocm image"
-            docker pull ${RELEASE_IMAGE_REPO}:$VERSION-rocm
-            docker tag ${RELEASE_IMAGE_REPO}:$VERSION-rocm ${FINAL_IMAGE_REPO}:$VERSION-rocm
-            docker push ${FINAL_IMAGE_REPO}:$VERSION-rocm
-        fi
-    else
-        echo "Skipping manifest generation when not pushing images are available locally as "
-        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-amd64"
-        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-arm64"
-        echo "  ${RELEASE_IMAGE_REPO}:$VERSION-rocm"
-    fi
-fi
# Build the multi-arch image and either load it into the local docker
# daemon or push it, depending on LOAD_OR_PUSH.
docker buildx build \
    ${LOAD_OR_PUSH} \
    --platform=${PLATFORM} \
    ${OLLAMA_COMMON_BUILD_ARGS} \
    -f Dockerfile \
    -t ${FINAL_IMAGE_REPO}:$VERSION \
    .

# The ROCm image is x86-only; build it only when amd64 is among the
# requested platforms.
if echo $PLATFORM | grep "amd64" > /dev/null; then
    docker buildx build \
        ${LOAD_OR_PUSH} \
        --platform=linux/amd64 \
        ${OLLAMA_COMMON_BUILD_ARGS} \
        --target runtime-rocm \
        -f Dockerfile \
        -t ${FINAL_IMAGE_REPO}:$VERSION-rocm \
        .
fi

+ 20 - 28
scripts/build_linux.sh

@@ -1,37 +1,29 @@
 #!/bin/sh
 #!/bin/sh
+#
+# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder
+#
+# docker context create amd64 --docker host=ssh://mybuildhost
+# docker buildx create --name mybuilder amd64 --platform linux/amd64
+# docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64
+# docker buildx use mybuilder
+
 
 
 set -eu
 set -eu
 
 
-export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
-export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
-GZIP=$(which pigz 2>/dev/null || echo "gzip")
+. $(dirname $0)/env.sh
 
 
-BUILD_ARCH=${BUILD_ARCH:-"amd64 arm64"}
-export AMDGPU_TARGETS=${AMDGPU_TARGETS:=""}
 mkdir -p dist
 mkdir -p dist
 
 
-for TARGETARCH in ${BUILD_ARCH}; do
-    docker build \
-        --platform=linux/$TARGETARCH \
-        --build-arg=GOFLAGS \
-        --build-arg=CGO_CFLAGS \
-        --build-arg=OLLAMA_CUSTOM_CPU_DEFS \
-        --build-arg=AMDGPU_TARGETS \
-        --target build-$TARGETARCH \
+docker buildx build \
+        --output type=local,dest=./dist/ \
+        --platform=${PLATFORM} \
+        ${OLLAMA_COMMON_BUILD_ARGS} \
+        --target dist \
         -f Dockerfile \
         -f Dockerfile \
-        -t builder:$TARGETARCH \
         .
         .
-    docker create --platform linux/$TARGETARCH --name builder-$TARGETARCH builder:$TARGETARCH
-    rm -rf ./dist/linux-$TARGETARCH
-    docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH ./dist
-    if echo ${TARGETARCH} | grep "amd64" > /dev/null; then
-        docker cp builder-$TARGETARCH:/go/src/github.com/ollama/ollama/dist/linux-$TARGETARCH-rocm ./dist
-    fi
-    docker rm builder-$TARGETARCH
-    echo "Compressing final linux bundle..."
-    rm -f ./dist/ollama-linux-$TARGETARCH.tgz
-    (cd dist/linux-$TARGETARCH && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH.tgz )
-    if [ -d dist/linux-$TARGETARCH-rocm ]; then
-        (cd dist/linux-$TARGETARCH-rocm && tar cf - . | ${GZIP} --best > ../ollama-linux-$TARGETARCH-rocm.tgz )
-    fi
-done
+
+# buildx behavior changes for single vs. multiplatform
+if echo $PLATFORM | grep "," > /dev/null ; then 
+        mv -f ./dist/linux_*64/ollama* ./dist/
+        rmdir ./dist/linux_*64
+fi

+ 14 - 0
scripts/env.sh

@@ -0,0 +1,14 @@
+# Common environment setup across build*.sh scripts
+
+export VERSION=${VERSION:-$(git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")}
+export GOFLAGS="'-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$VERSION\" \"-X=github.com/ollama/ollama/server.mode=release\"'"
+# TODO - consider `docker buildx ls --format=json` to autodiscover platform capability
+PLATFORM=${PLATFORM:-"linux/arm64,linux/amd64"}
+DOCKER_ORG=${DOCKER_ORG:-"ollama"}
+RELEASE_IMAGE_REPO=${RELEASE_IMAGE_REPO:-"${DOCKER_ORG}/release"}
+FINAL_IMAGE_REPO=${FINAL_IMAGE_REPO:-"${DOCKER_ORG}/ollama"}
+OLLAMA_COMMON_BUILD_ARGS="--build-arg=VERSION --build-arg=GOFLAGS --build-arg=OLLAMA_CUSTOM_CPU_DEFS --build-arg=AMDGPU_TARGETS"
+
+echo "Building Ollama"
+echo "VERSION=$VERSION"
+echo "PLATFORM=$PLATFORM"

+ 5 - 3
server/routes.go

@@ -26,11 +26,13 @@ import (
 	"golang.org/x/sync/errgroup"
 	"golang.org/x/sync/errgroup"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/build"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/parser"
+	"github.com/ollama/ollama/runners"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/template"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/errtypes"
 	"github.com/ollama/ollama/types/model"
 	"github.com/ollama/ollama/types/model"
@@ -1216,12 +1218,12 @@ func Serve(ln net.Listener) error {
 		srvr.Close()
 		srvr.Close()
 		schedDone()
 		schedDone()
 		sched.unloadAllRunners()
 		sched.unloadAllRunners()
-		gpu.Cleanup()
+		runners.Cleanup(build.EmbedFS)
 		done()
 		done()
 	}()
 	}()
 
 
-	if err := llm.Init(); err != nil {
-		return fmt.Errorf("unable to initialize llm library %w", err)
+	if _, err := runners.Refresh(build.EmbedFS); err != nil {
+		return fmt.Errorf("unable to initialize llm runners %w", err)
 	}
 	}
 
 
 	s.sched.Run(schedCtx)
 	s.sched.Run(schedCtx)