
Switch back to subprocessing for llama.cpp

This should resolve a number of memory leak and stability defects by allowing
us to isolate llama.cpp in a separate process, shut it down when idle, and
gracefully restart it if it has problems. This also serves as a first step
toward running multiple copies to support multiple models concurrently.
Daniel Hiltgen, 1 year ago
Parent commit: 58d95cc9bd
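
As a rough Go sketch of the pattern described above (not code from this commit), the snippet below launches a runner binary as a child process, asks it to exit gracefully when its context is canceled, and falls back to a hard kill after a timeout. The binary path, flags, and 5-second delay are illustrative assumptions.

```go
package main

import (
	"context"
	"log/slog"
	"os"
	"os/exec"
	"os/signal"
	"time"
)

// runRunner launches the llama.cpp runner as a child process and waits for it.
// When ctx is canceled it sends an interrupt so the server can drain in-flight
// work; WaitDelay hard-kills the child if it has not exited in time.
func runRunner(ctx context.Context, bin string, args ...string) error {
	cmd := exec.CommandContext(ctx, bin, args...)
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	cmd.Cancel = func() error {
		// Graceful shutdown first (note: this signal is not deliverable on Windows).
		return cmd.Process.Signal(os.Interrupt)
	}
	cmd.WaitDelay = 5 * time.Second // assumed timeout before a hard kill

	if err := cmd.Start(); err != nil {
		return err
	}
	slog.Info("runner started", "pid", cmd.Process.Pid)
	return cmd.Wait()
}

func main() {
	// Stop the runner when the parent process itself is interrupted.
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
	defer stop()

	// "./ollama_llama_server" is a hypothetical path; in practice the server
	// resolves the runner binary it extracted at startup.
	if err := runRunner(ctx, "./ollama_llama_server", "--port", "0"); err != nil {
		slog.Warn("runner exited", "err", err)
	}
}
```

The app/lifecycle/server.go change in this commit applies the same interrupt-then-kill idea to how the desktop app supervises the main server process.
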

.github/workflows/test.yaml (+24 -18)

@@ -56,10 +56,12 @@ jobs:
       - run: go get ./...
       - run: |
           $gopath=(get-command go).source | split-path -parent
+          $gccpath=(get-command gcc).source | split-path -parent
           & "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Launch-VsDevShell.ps1"
           cd $env:GITHUB_WORKSPACE
           $env:CMAKE_SYSTEM_VERSION="10.0.22621.0"
-          $env:PATH="$gopath;$env:PATH"
+          $env:PATH="$gopath;$gccpath;$env:PATH"
+          echo $env:PATH
           go generate -x ./...
         if: ${{ startsWith(matrix.os, 'windows-') }}
         name: "Windows Go Generate"
@@ -69,7 +71,9 @@ jobs:
       - uses: actions/upload-artifact@v4
         with:
           name: ${{ matrix.os }}-${{ matrix.arch }}-libraries
-          path: llm/llama.cpp/build/**/lib/*
+          path: |
+            llm/build/**/bin/*
+            llm/build/**/*.a
   generate-cuda:
     needs: [changes]
     if: ${{ needs.changes.outputs.GENERATE_CUDA == 'True' }}
@@ -100,7 +104,7 @@ jobs:
       - uses: actions/upload-artifact@v4
         with:
           name: cuda-${{ matrix.cuda-version }}-libraries
-          path: llm/llama.cpp/build/**/lib/*
+          path: llm/build/**/bin/*
   generate-rocm:
     needs: [changes]
     if: ${{ needs.changes.outputs.GENERATE_ROCM == 'True' }}
@@ -131,7 +135,7 @@ jobs:
       - uses: actions/upload-artifact@v4
         with:
           name: rocm-${{ matrix.rocm-version }}-libraries
-          path: llm/llama.cpp/build/**/lib/*
+          path: llm/build/**/lib/*
 
   # ROCm generation step
   generate-windows-rocm:
@@ -244,17 +248,17 @@ jobs:
           esac >>$GITHUB_ENV
         shell: bash
       - run: |
-          mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/
-          touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so
+          mkdir -p llm/build/linux/$ARCH/stub/bin/
+          touch llm/build/linux/$ARCH/stub/bin/stub.so
         if: ${{ startsWith(matrix.os, 'ubuntu-') }}
       - run: |
-          mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/
-          touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib
-          touch llm/llama.cpp/ggml-metal.metal
+          mkdir -p llm/build/darwin/$ARCH/stub/bin/
+          touch llm/build/darwin/$ARCH/stub/bin/stub.dylib
+          touch llm/ggml-metal.metal
         if: ${{ startsWith(matrix.os, 'macos-') }}
       - run: |
-          mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/
-          touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll
+          mkdir -p llm/build/windows/$ARCH/stub/stub/bin/
+          touch llm/build/windows/$ARCH/stub/stub/bin/stub.dll
         if: ${{ startsWith(matrix.os, 'windows-') }}
       - uses: golangci/golangci-lint-action@v3
   test:
@@ -271,6 +275,7 @@ jobs:
     env:
       GOARCH: ${{ matrix.arch }}
       CGO_ENABLED: '1'
+      OLLAMA_CPU_TARGET: "static"
     steps:
       - uses: actions/checkout@v4
         with:
@@ -287,18 +292,19 @@ jobs:
           esac >>$GITHUB_ENV
         shell: bash
       - run: |
-          mkdir -p llm/llama.cpp/build/linux/$ARCH/stub/lib/
-          touch llm/llama.cpp/build/linux/$ARCH/stub/lib/stub.so
+          mkdir -p llm/build/linux/$ARCH/stub/bin/
+          touch llm/build/linux/$ARCH/stub/bin/stub.so
         if: ${{ startsWith(matrix.os, 'ubuntu-') }}
       - run: |
-          mkdir -p llm/llama.cpp/build/darwin/$ARCH/stub/lib/
-          touch llm/llama.cpp/build/darwin/$ARCH/stub/lib/stub.dylib
-          touch llm/llama.cpp/ggml-metal.metal
+          mkdir -p llm/build/darwin/$ARCH/stub/bin/
+          touch llm/build/darwin/$ARCH/stub/bin/stub.dylib
+          touch llm/ggml-metal.metal
         if: ${{ startsWith(matrix.os, 'macos-') }}
       - run: |
-          mkdir -p llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/
-          touch llm/llama.cpp/build/windows/$ARCH/stub/stub/lib/stub.dll
+          mkdir -p llm/build/windows/$ARCH/stub/stub/bin/
+          touch llm/build/windows/$ARCH/stub/stub/bin/stub.dll
         if: ${{ startsWith(matrix.os, 'windows-') }}
+      - run: go generate ./...
       - run: go build
       - run: go test -v ./...
       - uses: actions/upload-artifact@v4

.gitignore (+2 -1)

@@ -10,4 +10,5 @@ ggml-metal.metal
 *.exe
 .idea
 test_data
-*.crt
+*.crt
+llm/build

Dockerfile (+16 -9)

@@ -61,6 +61,8 @@ ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
 WORKDIR /go/src/github.com/ollama/ollama/llm/generate
 
+FROM --platform=linux/amd64 cpu-builder-amd64 AS static-build-amd64
+RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu-build-amd64
 RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx-build-amd64
@@ -68,28 +70,33 @@ RUN OLLAMA_CPU_TARGET="cpu_avx" sh gen_linux.sh
 FROM --platform=linux/amd64 cpu-builder-amd64 AS cpu_avx2-build-amd64
 RUN OLLAMA_CPU_TARGET="cpu_avx2" sh gen_linux.sh
 
-FROM --platform=linux/arm64 centos:7 AS cpu-build-arm64
+FROM --platform=linux/arm64 centos:7 AS cpu-builder-arm64
 ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 COPY ./scripts/rh_linux_deps.sh /
 RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
 COPY --from=llm-code / /go/src/github.com/ollama/ollama/
-WORKDIR /go/src/github.com/ollama/ollama/llm/generate
-# Note, we only build the "base" CPU variant on arm since avx/avx2 are x86 features
 ARG OLLAMA_CUSTOM_CPU_DEFS
 ARG CGO_CFLAGS
+WORKDIR /go/src/github.com/ollama/ollama/llm/generate
+
+FROM --platform=linux/arm64 cpu-builder-arm64 AS static-build-arm64
+RUN OLLAMA_CPU_TARGET="static" sh gen_linux.sh
+FROM --platform=linux/arm64 cpu-builder-arm64 AS cpu-build-arm64
 RUN OLLAMA_CPU_TARGET="cpu" sh gen_linux.sh
 
+
 # Intermediate stage used for ./scripts/build_linux.sh
 FROM --platform=linux/amd64 cpu-build-amd64 AS build-amd64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
+COPY --from=static-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cpu_avx2-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 COPY --from=rocm-build-amd64 /go/src/github.com/ollama/ollama/dist/deps/ ./dist/deps/
 ARG GOFLAGS
 ARG CGO_CFLAGS
@@ -101,8 +108,8 @@ ENV CGO_ENABLED 1
 ARG GOLANG_VERSION
 WORKDIR /go/src/github.com/ollama/ollama
 COPY . .
-COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/llama.cpp/build/linux/ llm/llama.cpp/build/linux/
-RUN mkdir -p /go/src/github.com/ollama/ollama/dist/deps/
+COPY --from=static-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
+COPY --from=cuda-build-arm64 /go/src/github.com/ollama/ollama/llm/build/linux/ llm/build/linux/
 ARG GOFLAGS
 ARG CGO_CFLAGS
 RUN go build -trimpath .

app/lifecycle/server.go (+24 -1)

@@ -9,6 +9,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"syscall"
 	"time"
 
 	"github.com/ollama/ollama/api"
@@ -83,6 +84,28 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
 		io.Copy(logFile, stderr) //nolint:errcheck
 	}()
 
+	// Re-wire context done behavior to attempt a graceful shutdown of the server
+	cmd.Cancel = func() error {
+		if cmd.Process != nil {
+			cmd.Process.Signal(os.Interrupt) //nolint:errcheck
+			tick := time.NewTicker(10 * time.Millisecond)
+			defer tick.Stop()
+			for {
+				select {
+				case <-tick.C:
+					// OS agnostic "is it still running"
+					if proc, err := os.FindProcess(int(cmd.Process.Pid)); err != nil || errors.Is(proc.Signal(syscall.Signal(0)), os.ErrProcessDone) {
+						return nil //nolint:nilerr
+					}
+				case <-time.After(5 * time.Second):
+					slog.Warn("graceful server shutdown timeout, killing", "pid", cmd.Process.Pid)
+					cmd.Process.Kill() //nolint:errcheck
+				}
+			}
+		}
+		return nil
+	}
+
 	// run the command and wait for it to finish
 	if err := cmd.Start(); err != nil {
 		return done, fmt.Errorf("failed to start server %w", err)
@@ -105,7 +128,7 @@ func SpawnServer(ctx context.Context, command string) (chan int, error) {
 
 			select {
 			case <-ctx.Done():
-				slog.Debug(fmt.Sprintf("server shutdown with exit code %d", code))
+				slog.Info(fmt.Sprintf("server shutdown with exit code %d", code))
 				done <- code
 				return
 			default:

gpu/amd_linux.go (+17 -0)

@@ -100,6 +100,8 @@ func AMDGetGPUInfo(resp *GpuInfo) {
 		return
 	}
 
+	updateLibPath(libDir)
+
 	gfxOverride := os.Getenv("HSA_OVERRIDE_GFX_VERSION")
 	if gfxOverride == "" {
 		supported, err := GetSupportedGFX(libDir)
@@ -143,6 +145,21 @@ func AMDGetGPUInfo(resp *GpuInfo) {
 	}
 }
 
+func updateLibPath(libDir string) {
+	ldPaths := []string{}
+	if val, ok := os.LookupEnv("LD_LIBRARY_PATH"); ok {
+		ldPaths = strings.Split(val, ":")
+	}
+	for _, d := range ldPaths {
+		if d == libDir {
+			return
+		}
+	}
+	val := strings.Join(append(ldPaths, libDir), ":")
+	slog.Debug("updated lib path", "LD_LIBRARY_PATH", val)
+	os.Setenv("LD_LIBRARY_PATH", val)
+}
+
 // Walk the sysfs nodes for the available GPUs and gather information from them
 // skipping over any devices in the skip map
 func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {

gpu/assets.go (+7 -1)

@@ -11,6 +11,7 @@ import (
 	"strings"
 	"sync"
 	"syscall"
+	"time"
 )
 
 var (
@@ -84,7 +85,12 @@ func Cleanup() {
 		slog.Debug("cleaning up", "dir", tmpDir)
 		err := os.RemoveAll(tmpDir)
 		if err != nil {
-			slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
+			// On windows, if we remove too quickly the llama.dll may still be in-use and fail to remove
+			time.Sleep(1000 * time.Millisecond)
+			err = os.RemoveAll(tmpDir)
+			if err != nil {
+				slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
+			}
 		}
 	}
 }

llm/dyn_ext_server.c (+0 -142)

@@ -1,142 +0,0 @@
-#include "dyn_ext_server.h"
-
-#include <stdio.h>
-#include <string.h>
-
-#ifdef __linux__
-#include <dlfcn.h>
-#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
-#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
-#define LOAD_ERR() strdup(dlerror())
-#define UNLOAD_LIBRARY(handle) dlclose(handle)
-#elif _WIN32
-#include <windows.h>
-#define LOAD_LIBRARY(lib, flags) LoadLibrary(lib)
-#define LOAD_SYMBOL(handle, sym) GetProcAddress(handle, sym)
-#define UNLOAD_LIBRARY(handle) FreeLibrary(handle)
-#define LOAD_ERR() ({\
-  LPSTR messageBuffer = NULL; \
-  size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, \
-                                 NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); \
-  char *resp = strdup(messageBuffer); \
-  LocalFree(messageBuffer); \
-  resp; \
-})
-#else
-#include <dlfcn.h>
-#define LOAD_LIBRARY(lib, flags) dlopen(lib, flags)
-#define LOAD_SYMBOL(handle, sym) dlsym(handle, sym)
-#define LOAD_ERR() strdup(dlerror())
-#define UNLOAD_LIBRARY(handle) dlclose(handle)
-#endif
-
-void dyn_init(const char *libPath, struct dynamic_llama_server *s,
-                       ext_server_resp_t *err) {
-  int i = 0;
-  struct lookup {
-    char *s;
-    void **p;
-  } l[] = {
-      {"llama_server_init", (void *)&s->llama_server_init},
-      {"llama_server_start", (void *)&s->llama_server_start},
-      {"llama_server_stop", (void *)&s->llama_server_stop},
-      {"llama_server_completion", (void *)&s->llama_server_completion},
-      {"llama_server_completion_next_result",
-       (void *)&s->llama_server_completion_next_result},
-      {"llama_server_completion_cancel",
-       (void *)&s->llama_server_completion_cancel},
-      {"llama_server_release_task_result",
-       (void *)&s->llama_server_release_task_result},
-      {"llama_server_tokenize", (void *)&s->llama_server_tokenize},
-      {"llama_server_detokenize", (void *)&s->llama_server_detokenize},
-      {"llama_server_embedding", (void *)&s->llama_server_embedding},
-      {"llama_server_release_json_resp",
-       (void *)&s->llama_server_release_json_resp},
-      {"", NULL},
-  };
-
-  printf("loading library %s\n", libPath);
-  s->handle = LOAD_LIBRARY(libPath, RTLD_LOCAL|RTLD_NOW);
-  if (!s->handle) {
-    err->id = -1;
-    char *msg = LOAD_ERR();
-    snprintf(err->msg, err->msg_len,
-             "Unable to load dynamic server library: %s", msg);
-    free(msg);
-    return;
-  }
-
-  for (i = 0; l[i].p != NULL; i++) {
-    *l[i].p = LOAD_SYMBOL(s->handle, l[i].s);
-    if (!l[i].p) {
-      UNLOAD_LIBRARY(s->handle);
-      err->id = -1;
-      char *msg = LOAD_ERR();
-      snprintf(err->msg, err->msg_len, "symbol lookup for %s failed: %s",
-               l[i].s, msg);
-      free(msg);
-      return;
-    }
-  }
-}
-
-inline void dyn_llama_server_init(struct dynamic_llama_server s,
-                                           ext_server_params_t *sparams,
-                                           ext_server_resp_t *err) {
-  s.llama_server_init(sparams, err);
-}
-
-inline void dyn_llama_server_start(struct dynamic_llama_server s) {
-  s.llama_server_start();
-}
-
-inline void dyn_llama_server_stop(struct dynamic_llama_server s) {
-  s.llama_server_stop();
-}
-
-inline void dyn_llama_server_completion(struct dynamic_llama_server s,
-                                                 const char *json_req,
-                                                 ext_server_resp_t *resp) {
-  s.llama_server_completion(json_req, resp);
-}
-
-inline void dyn_llama_server_completion_next_result(
-    struct dynamic_llama_server s, const int task_id,
-    ext_server_task_result_t *result) {
-  s.llama_server_completion_next_result(task_id, result);
-}
-
-inline void dyn_llama_server_completion_cancel(
-    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
-  s.llama_server_completion_cancel(task_id, err);
-}
-inline void dyn_llama_server_release_task_result(
-    struct dynamic_llama_server s, ext_server_task_result_t *result) {
-  s.llama_server_release_task_result(result);
-}
-
-inline void dyn_llama_server_tokenize(struct dynamic_llama_server s,
-                                               const char *json_req,
-                                               char **json_resp,
-                                               ext_server_resp_t *err) {
-  s.llama_server_tokenize(json_req, json_resp, err);
-}
-
-inline void dyn_llama_server_detokenize(struct dynamic_llama_server s,
-                                                 const char *json_req,
-                                                 char **json_resp,
-                                                 ext_server_resp_t *err) {
-  s.llama_server_detokenize(json_req, json_resp, err);
-}
-
-inline void dyn_llama_server_embedding(struct dynamic_llama_server s,
-                                                const char *json_req,
-                                                char **json_resp,
-                                                ext_server_resp_t *err) {
-  s.llama_server_embedding(json_req, json_resp, err);
-}
-
-inline void dyn_llama_server_release_json_resp(
-    struct dynamic_llama_server s, char **json_resp) {
-  s.llama_server_release_json_resp(json_resp);
-}

llm/dyn_ext_server.go (+0 -388)

@@ -1,388 +0,0 @@
-package llm
-
-/*
-#cgo CFLAGS: -I${SRCDIR}/ext_server -I${SRCDIR}/llama.cpp -I${SRCDIR}/llama.cpp/common -I${SRCDIR}/llama.cpp/examples/server
-#cgo CFLAGS: -DNDEBUG -DLLAMA_SERVER_LIBRARY=1 -D_XOPEN_SOURCE=600 -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
-#cgo CFLAGS: -Wmissing-noreturn -Wextra -Wcast-qual -Wno-unused-function -Wno-array-bounds
-#cgo CPPFLAGS: -Ofast -Wextra -Wno-unused-function -Wno-unused-variable -Wno-deprecated-declarations
-#cgo darwin CFLAGS: -D_DARWIN_C_SOURCE
-#cgo darwin CPPFLAGS:  -DGGML_USE_ACCELERATE
-#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
-#cgo darwin LDFLAGS: -lc++ -framework Accelerate
-#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-#cgo linux CFLAGS: -D_GNU_SOURCE
-#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
-#cgo linux windows LDFLAGS: -lpthread
-
-#include <stdlib.h>
-#include "dyn_ext_server.h"
-
-*/
-import "C"
-
-import (
-	"bytes"
-	"context"
-	"encoding/json"
-	"fmt"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"strings"
-	"sync"
-	"time"
-	"unsafe"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/gpu"
-)
-
-type dynExtServer struct {
-	s       C.struct_dynamic_llama_server
-	options *api.Options
-}
-
-// Note: current implementation does not support concurrent instantiations
-var mutex sync.Mutex
-
-func newExtServerResp(len C.size_t) C.ext_server_resp_t {
-	var resp C.ext_server_resp_t
-	resp.msg_len = len
-	bytes := make([]byte, len)
-	resp.msg = (*C.char)(C.CBytes(bytes))
-	return resp
-}
-
-func freeExtServerResp(resp C.ext_server_resp_t) {
-	if resp.msg_len == 0 {
-		return
-	}
-	C.free(unsafe.Pointer(resp.msg))
-}
-
-func extServerResponseToErr(resp C.ext_server_resp_t) error {
-	return fmt.Errorf(C.GoString(resp.msg))
-}
-
-func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
-	if !mutex.TryLock() {
-		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
-		mutex.Lock()
-	}
-	gpu.UpdatePath(filepath.Dir(library))
-	libPath := C.CString(library)
-	defer C.free(unsafe.Pointer(libPath))
-	resp := newExtServerResp(512)
-	defer freeExtServerResp(resp)
-	var srv C.struct_dynamic_llama_server
-	C.dyn_init(libPath, &srv, &resp)
-	if resp.id < 0 {
-		mutex.Unlock()
-		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
-	}
-	llm := dynExtServer{
-		s:       srv,
-		options: opts,
-	}
-	slog.Info(fmt.Sprintf("Loading Dynamic llm server: %s", library))
-
-	var sparams C.ext_server_params_t
-	sparams.model = C.CString(model)
-	defer C.free(unsafe.Pointer(sparams.model))
-
-	sparams.embedding = true
-	sparams.n_ctx = C.uint(opts.NumCtx)
-	sparams.n_batch = C.uint(opts.NumBatch)
-	sparams.n_gpu_layers = C.int(opts.NumGPU)
-	sparams.main_gpu = C.int(opts.MainGPU)
-	sparams.n_parallel = 1 // TODO - wire up concurrency
-
-	// Always use the value encoded in the model
-	sparams.rope_freq_base = 0.0
-	sparams.rope_freq_scale = 0.0
-	sparams.memory_f16 = C.bool(opts.F16KV)
-	sparams.use_mlock = C.bool(opts.UseMLock)
-	sparams.use_mmap = C.bool(opts.UseMMap)
-
-	if opts.UseNUMA {
-		sparams.numa = C.int(1)
-	} else {
-		sparams.numa = C.int(0)
-	}
-
-	sparams.lora_adapters = nil
-	for i := 0; i < len(adapters); i++ {
-		la := (*C.ext_server_lora_adapter_t)(C.malloc(C.sizeof_ext_server_lora_adapter_t))
-		defer C.free(unsafe.Pointer(la))
-		la.adapter = C.CString(adapters[i])
-		defer C.free(unsafe.Pointer(la.adapter))
-		la.scale = C.float(1.0) // TODO expose scale/weights up through ollama UX
-		la.next = nil
-		if i == 0 {
-			sparams.lora_adapters = la
-		} else {
-			tmp := sparams.lora_adapters
-			for ; tmp.next != nil; tmp = tmp.next {
-			}
-			tmp.next = la
-		}
-	}
-
-	if len(projectors) > 0 {
-		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
-		sparams.mmproj = C.CString(projectors[0])
-		defer C.free(unsafe.Pointer(sparams.mmproj))
-	} else {
-		sparams.mmproj = nil
-	}
-
-	sparams.n_threads = C.uint(opts.NumThread)
-
-	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
-		sparams.verbose_logging = C.bool(true)
-	} else {
-		sparams.verbose_logging = C.bool(false)
-	}
-
-	slog.Info("Initializing llama server")
-	slog.Debug(fmt.Sprintf("server params: %+v", sparams))
-	initResp := newExtServerResp(512)
-	defer freeExtServerResp(initResp)
-	C.dyn_llama_server_init(llm.s, &sparams, &initResp)
-	if initResp.id < 0 {
-		mutex.Unlock()
-		err := extServerResponseToErr(initResp)
-		slog.Debug(fmt.Sprintf("failure during initialization: %s", err))
-		return nil, err
-	}
-
-	slog.Info("Starting llama main loop")
-	C.dyn_llama_server_start(llm.s)
-	return &llm, nil
-}
-
-func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
-	resp := newExtServerResp(128)
-	defer freeExtServerResp(resp)
-
-	if len(predict.Images) > 0 {
-		slog.Info(fmt.Sprintf("loaded %d images", len(predict.Images)))
-	}
-
-	request := map[string]any{
-		"prompt":            predict.Prompt,
-		"stream":            true,
-		"n_predict":         predict.Options.NumPredict,
-		"n_keep":            predict.Options.NumKeep,
-		"temperature":       predict.Options.Temperature,
-		"top_k":             predict.Options.TopK,
-		"top_p":             predict.Options.TopP,
-		"tfs_z":             predict.Options.TFSZ,
-		"typical_p":         predict.Options.TypicalP,
-		"repeat_last_n":     predict.Options.RepeatLastN,
-		"repeat_penalty":    predict.Options.RepeatPenalty,
-		"presence_penalty":  predict.Options.PresencePenalty,
-		"frequency_penalty": predict.Options.FrequencyPenalty,
-		"mirostat":          predict.Options.Mirostat,
-		"mirostat_tau":      predict.Options.MirostatTau,
-		"mirostat_eta":      predict.Options.MirostatEta,
-		"penalize_nl":       predict.Options.PenalizeNewline,
-		"seed":              predict.Options.Seed,
-		"stop":              predict.Options.Stop,
-		"image_data":        predict.Images,
-		"cache_prompt":      true,
-	}
-
-	if predict.Format == "json" {
-		request["grammar"] = jsonGrammar
-		if !strings.Contains(strings.ToLower(predict.Prompt), "json") {
-			slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
-		}
-	}
-
-	retryDelay := 100 * time.Microsecond
-	for retries := 0; retries < maxRetries; retries++ {
-		if retries > 0 {
-			time.Sleep(retryDelay) // wait before retrying
-			retryDelay *= 2        // exponential backoff
-		}
-
-		// Handling JSON marshaling with special characters unescaped.
-		buffer := &bytes.Buffer{}
-		enc := json.NewEncoder(buffer)
-		enc.SetEscapeHTML(false)
-
-		if err := enc.Encode(request); err != nil {
-			return fmt.Errorf("failed to marshal data: %w", err)
-		}
-
-		req := C.CString(buffer.String())
-		defer C.free(unsafe.Pointer(req))
-
-		C.dyn_llama_server_completion(llm.s, req, &resp)
-		if resp.id < 0 {
-			return extServerResponseToErr(resp)
-		}
-
-		retryNeeded := false
-		// keep track of the last token generated, this is used to abort if the model starts looping
-		var lastToken string
-		var tokenRepeat int
-	out:
-		for {
-			select {
-			case <-ctx.Done():
-				return cancelCompletion(llm, resp)
-			default:
-				var result C.ext_server_task_result_t
-				C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
-				json_resp := C.GoString(result.json_resp)
-				C.dyn_llama_server_release_task_result(llm.s, &result)
-
-				var p prediction
-				if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
-					C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
-					if resp.id < 0 {
-						return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
-					} else {
-						return fmt.Errorf("error unmarshaling llm prediction response: %w", err)
-					}
-				}
-
-				if bool(result.error) && strings.Contains(json_resp, "slot unavailable") {
-					retryNeeded = true
-					// task will already be canceled
-					break out
-				}
-
-				switch {
-				case strings.TrimSpace(p.Content) == lastToken:
-					tokenRepeat++
-				default:
-					lastToken = strings.TrimSpace(p.Content)
-					tokenRepeat = 0
-				}
-
-				// 30 picked as an arbitrary max token repeat limit, modify as needed
-				if tokenRepeat > 30 {
-					slog.Debug("prediction aborted, token repeat limit reached")
-					return cancelCompletion(llm, resp)
-				}
-
-				if p.Content != "" {
-					fn(PredictResult{
-						Content: p.Content,
-					})
-				}
-
-				if p.Stop || bool(result.stop) {
-					fn(PredictResult{
-						Done:               true,
-						PromptEvalCount:    p.Timings.PromptN,
-						PromptEvalDuration: parseDurationMs(p.Timings.PromptMS),
-						EvalCount:          p.Timings.PredictedN,
-						EvalDuration:       parseDurationMs(p.Timings.PredictedMS),
-					})
-					return nil
-				}
-			}
-		}
-		if !retryNeeded {
-			return nil // success
-		}
-	}
-
-	// should never reach here ideally
-	return fmt.Errorf("max retries exceeded")
-}
-
-func cancelCompletion(llm *dynExtServer, resp C.ext_server_resp_t) error {
-	C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
-	if resp.id < 0 {
-		return extServerResponseToErr(resp)
-	} else {
-		return nil
-	}
-}
-
-func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
-	data, err := json.Marshal(TokenizeRequest{Content: prompt})
-	if err != nil {
-		return nil, fmt.Errorf("marshaling encode data: %w", err)
-	}
-	req := C.CString(string(data))
-	defer C.free(unsafe.Pointer(req))
-	var json_resp *C.char
-	resp := newExtServerResp(128)
-	defer freeExtServerResp(resp)
-	C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
-	if resp.id < 0 {
-		return nil, extServerResponseToErr(resp)
-	}
-	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
-
-	var encoded TokenizeResponse
-	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
-		return nil, fmt.Errorf("unmarshal encode response: %w", err2)
-	}
-
-	return encoded.Tokens, err
-}
-
-func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
-	if len(tokens) == 0 {
-		return "", nil
-	}
-	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
-	if err != nil {
-		return "", fmt.Errorf("marshaling decode data: %w", err)
-	}
-
-	req := C.CString(string(data))
-	defer C.free(unsafe.Pointer(req))
-	var json_resp *C.char
-	resp := newExtServerResp(128)
-	defer freeExtServerResp(resp)
-	C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
-	if resp.id < 0 {
-		return "", extServerResponseToErr(resp)
-	}
-	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
-
-	var decoded DetokenizeResponse
-	if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
-		return "", fmt.Errorf("unmarshal encode response: %w", err2)
-	}
-
-	return decoded.Content, err
-}
-
-func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
-	data, err := json.Marshal(TokenizeRequest{Content: input})
-	if err != nil {
-		return nil, fmt.Errorf("error marshaling embed data: %w", err)
-	}
-
-	req := C.CString(string(data))
-	defer C.free(unsafe.Pointer(req))
-	var json_resp *C.char
-	resp := newExtServerResp(128)
-	defer freeExtServerResp(resp)
-	C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
-	if resp.id < 0 {
-		return nil, extServerResponseToErr(resp)
-	}
-	defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)
-
-	var embedding EmbeddingResponse
-	if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
-		return nil, fmt.Errorf("unmarshal tokenize response: %w", err)
-	}
-
-	return embedding.Embedding, nil
-}
-
-func (llm *dynExtServer) Close() {
-	C.dyn_llama_server_stop(llm.s)
-	mutex.Unlock()
-}

llm/dyn_ext_server.h (+0 -74)

@@ -1,74 +0,0 @@
-#include <stdlib.h>
-
-#include "ext_server.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-struct dynamic_llama_server {
-  void *handle;
-  void (*llama_server_init)(ext_server_params_t *sparams,
-                            ext_server_resp_t *err);
-  void (*llama_server_start)();
-  void (*llama_server_stop)();
-  void (*llama_server_completion)(const char *json_req,
-                                  ext_server_resp_t *resp);
-  void (*llama_server_completion_next_result)(const int task_id,
-                                              ext_server_task_result_t *result);
-  void (*llama_server_completion_cancel)(const int task_id,
-                                         ext_server_resp_t *err);
-  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
-  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
-                                ext_server_resp_t *err);
-  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
-                                  ext_server_resp_t *err);
-  void (*llama_server_embedding)(const char *json_req, char **json_resp,
-                                 ext_server_resp_t *err);
-  void (*llama_server_release_json_resp)(char **json_resp);
-};
-
-void dyn_init(const char *libPath, struct dynamic_llama_server *s,
-                       ext_server_resp_t *err);
-
-// No good way to call C function pointers from Go so inline the indirection
-void dyn_llama_server_init(struct dynamic_llama_server s,
-                                    ext_server_params_t *sparams,
-                                    ext_server_resp_t *err);
-
-void dyn_llama_server_start(struct dynamic_llama_server s);
-
-void dyn_llama_server_stop(struct dynamic_llama_server s);
-
-void dyn_llama_server_completion(struct dynamic_llama_server s,
-                                          const char *json_req,
-                                          ext_server_resp_t *resp);
-
-void dyn_llama_server_completion_next_result(
-    struct dynamic_llama_server s, const int task_id,
-    ext_server_task_result_t *result);
-
-void dyn_llama_server_completion_cancel(struct dynamic_llama_server s,
-                                                 const int task_id,
-                                                 ext_server_resp_t *err);
-
-void dyn_llama_server_release_task_result(
-    struct dynamic_llama_server s, ext_server_task_result_t *result);
-
-void dyn_llama_server_tokenize(struct dynamic_llama_server s,
-                                        const char *json_req, char **json_resp,
-                                        ext_server_resp_t *err);
-
-void dyn_llama_server_detokenize(struct dynamic_llama_server s,
-                                          const char *json_req,
-                                          char **json_resp,
-                                          ext_server_resp_t *err);
-
-void dyn_llama_server_embedding(struct dynamic_llama_server s,
-                                         const char *json_req, char **json_resp,
-                                         ext_server_resp_t *err);
-void dyn_llama_server_release_json_resp(struct dynamic_llama_server s,
-                                                 char **json_resp);
-
-#ifdef __cplusplus
-}
-#endif

llm/ext_server/CMakeLists.txt (+10 -17)

@@ -1,21 +1,14 @@
 
-set(TARGET ext_server)
+set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
+install(TARGETS ${TARGET} RUNTIME)
+target_compile_definitions(${TARGET} PRIVATE
+    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+)
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
-    add_library(${TARGET} SHARED ext_server.cpp ../llama.cpp/llama.cpp)
-else()
-    add_library(${TARGET} STATIC ext_server.cpp ../llama.cpp/llama.cpp)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-target_compile_definitions(${TARGET} PUBLIC LLAMA_SERVER_LIBRARY=1)
-target_link_libraries(${TARGET} PRIVATE ggml llava common )
-set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>)
-install(TARGETS ext_server LIBRARY)
-
-if (CUDAToolkit_FOUND)
-    target_include_directories(${TARGET} PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-    if (WIN32)
-        target_link_libraries(${TARGET} PRIVATE nvml)
-    endif()
-endif()
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

llm/ext_server/README.md (+0 -18)

@@ -1,18 +0,0 @@
-# Extern C Server
-
-This directory contains a thin facade we layer on top of the Llama.cpp server to
-expose `extern C` interfaces to access the functionality through direct API
-calls in-process.  The llama.cpp code uses compile time macros to configure GPU
-type along with other settings.  During the `go generate ./...` execution, the
-build will generate one or more copies of the llama.cpp `extern C` server based
-on what GPU libraries are detected to support multiple GPU types as well as CPU
-only support. The Ollama go build then embeds these different servers to support
-different GPUs and settings at runtime.
-
-If you are making changes to the code in this directory, make sure to disable
-caching during your go build to ensure you pick up your changes.  A typical
-iteration cycle from the top of the source tree looks like:
-
-```
-go generate ./... && go build -a .
-```

llm/ext_server/ext_server.cpp (+0 -377)

@@ -1,377 +0,0 @@
-#include "ext_server.h"
-#include <atomic>
-
-// Necessary evil since the server types are not defined in a header
-#include "server.cpp"
-
-// Low level API access to verify GPU access
-#if defined(GGML_USE_CUBLAS)
-#if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define cudaGetDevice hipGetDevice
-#define cudaError_t hipError_t
-#define cudaSuccess hipSuccess
-#define cudaGetErrorString hipGetErrorString
-#else
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <cuda_fp16.h>
-#endif // defined(GGML_USE_HIPBLAS)
-#endif // GGML_USE_CUBLAS
-
-// Expose the llama server as a callable extern "C" API
-llama_server_context *llama = NULL;
-std::thread ext_server_thread;
-bool shutting_down = false;
-std::atomic_int recv_counter;
-
-// RAII wrapper for tracking in-flight recv calls
-class atomicRecv {
-  public:
-    atomicRecv(std::atomic<int> &atomic) : atomic(atomic) {
-      ++this->atomic;
-    }
-    ~atomicRecv() {
-      --this->atomic;
-    }
-  private:
-    std::atomic<int> &atomic;
-};
- 
-void llama_server_init(ext_server_params *sparams, ext_server_resp_t *err) {
-  recv_counter = 0;
-  assert(err != NULL && sparams != NULL);
-  log_set_target(stderr);
-  if (!sparams->verbose_logging) {
-    server_verbose = true;
-    log_disable();
-  }
-
-  LOG_TEE("system info: %s\n", llama_print_system_info());
-  err->id = 0;
-  err->msg[0] = '\0';
-  try {
-    llama = new llama_server_context;
-    gpt_params params;
-    params.n_ctx = sparams->n_ctx;
-    params.n_batch = sparams->n_batch;
-    if (sparams->n_threads > 0) {
-      params.n_threads = sparams->n_threads;
-    }
-    params.n_parallel = sparams->n_parallel;
-    params.rope_freq_base = sparams->rope_freq_base;
-    params.rope_freq_scale = sparams->rope_freq_scale;
-
-    if (sparams->memory_f16) {
-      params.cache_type_k = "f16";
-      params.cache_type_v = "f16";
-    } else {
-      params.cache_type_k = "f32";
-      params.cache_type_v = "f32";
-    }
-
-    params.n_gpu_layers = sparams->n_gpu_layers;
-    params.main_gpu = sparams->main_gpu;
-    params.use_mlock = sparams->use_mlock;
-    params.use_mmap = sparams->use_mmap;
-    params.numa = (ggml_numa_strategy)sparams->numa;
-    params.embedding = sparams->embedding;
-    if (sparams->model != NULL) {
-      params.model = sparams->model;
-    }
-
-    if (sparams->lora_adapters != NULL) {
-      for (ext_server_lora_adapter *la = sparams->lora_adapters; la != NULL;
-          la = la->next) {
-        params.lora_adapter.push_back(std::make_tuple(la->adapter, la->scale));
-      }
-
-      params.use_mmap = false;
-    }
-
-    if (sparams->mmproj != NULL) {
-      params.mmproj = std::string(sparams->mmproj);
-    }
-
-#if defined(GGML_USE_CUBLAS)
-    // Before attempting to init the backend which will assert on error, verify the CUDA/ROCM GPU is accessible
-    LOG_TEE("Performing pre-initialization of GPU\n");
-    int id;
-    cudaError_t cudaErr = cudaGetDevice(&id);
-    if (cudaErr != cudaSuccess) {
-      err->id = -1;
-      snprintf(err->msg, err->msg_len, "Unable to init GPU: %s", cudaGetErrorString(cudaErr));
-      return;
-    }
-#endif
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-  if (!llama->load_model(params)) { 
-    // an error occurred that was not thrown
-    err->id = -1;
-    snprintf(err->msg, err->msg_len, "error loading model %s", params.model.c_str());
-    return;
-  }
-
-    llama->initialize();
-  } catch (std::exception &e) {
-    err->id = -1;
-    snprintf(err->msg, err->msg_len, "exception %s", e.what());
-  } catch (...) {
-    err->id = -1;
-    snprintf(err->msg, err->msg_len,
-             "Unknown exception initializing llama server");
-  }
-}
-
-void llama_server_start() {
-  assert(llama != NULL);
-  // TODO mutex to protect thread creation
-  ext_server_thread = std::thread([&]() {
-    try {
-      LOG_TEE("llama server main loop starting\n");
-      ggml_time_init();
-      llama->queue_tasks.on_new_task(std::bind(
-        &llama_server_context::process_single_task, llama, std::placeholders::_1));
-      llama->queue_tasks.on_finish_multitask(std::bind(
-        &llama_server_context::on_finish_multitask, llama, std::placeholders::_1));
-      llama->queue_tasks.on_run_slots(std::bind(
-        &llama_server_context::update_slots, llama));
-      llama->queue_results.on_multitask_update(std::bind(
-          &llama_server_queue::update_multitask,
-          &llama->queue_tasks,
-          std::placeholders::_1,
-          std::placeholders::_2,
-          std::placeholders::_3
-        ));
-      llama->queue_tasks.start_loop();
-    } catch (std::exception &e) {
-      LOG_TEE("caught exception in llama server main loop: %s\n", e.what());
-    } catch (...) {
-      LOG_TEE("caught unknown exception in llama server main loop\n");
-    }
-    LOG_TEE("\nllama server shutting down\n");
-    llama_backend_free();
-  });
-}
-
-void llama_server_stop() {
-  assert(llama != NULL);
-  // Shutdown any in-flight requests and block incoming requests.
-  LOG_TEE("\ninitiating shutdown - draining remaining tasks...\n");
-  shutting_down = true;
-
-  while (recv_counter.load() > 0) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(50));
-  }
-
-  // This may take a while for any pending tasks to drain
-  // TODO - consider a timeout to cancel tasks if it's taking too long
-  llama->queue_tasks.terminate();
-  ext_server_thread.join();
-  delete llama;
-  llama = NULL;
-  LOG_TEE("llama server shutdown complete\n");
-  shutting_down = false;
-}
-
-void llama_server_completion(const char *json_req, ext_server_resp_t *resp) {
-  assert(llama != NULL && json_req != NULL && resp != NULL);
-  resp->id = -1;
-  resp->msg[0] = '\0';
-  try {
-    if (shutting_down) {
-      throw std::runtime_error("server shutting down");
-    }
-    json data = json::parse(json_req);
-    resp->id = llama->queue_tasks.get_new_id();
-    llama->queue_results.add_waiting_task_id(resp->id);
-    llama->request_completion(resp->id, data, false, false, -1);
-  } catch (std::exception &e) {
-    snprintf(resp->msg, resp->msg_len, "exception %s", e.what());
-  } catch (...) {
-    snprintf(resp->msg, resp->msg_len, "Unknown exception during completion");
-  }
-}
-
-void llama_server_completion_next_result(const int task_id,
-                                         ext_server_task_result_t *resp) {
-  assert(llama != NULL && resp != NULL);
-  resp->id = -1;
-  resp->stop = false;
-  resp->error = false;
-  resp->json_resp = NULL;
-  std::string result_json;
-  try {
-    atomicRecv ar(recv_counter);
-    task_result result = llama->queue_results.recv(task_id);
-    result_json =
-        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace);
-    resp->id = result.id;
-    resp->stop = result.stop;
-    resp->error = result.error;
-    if (result.error) {
-      LOG_TEE("next result cancel on error\n");
-      llama->request_cancel(task_id);
-      LOG_TEE("next result removing waiting tak ID: %d\n", task_id);
-      llama->queue_results.remove_waiting_task_id(task_id);
-    } else if (result.stop) {
-      LOG_TEE("next result cancel on stop\n");
-      llama->request_cancel(task_id);
-      LOG_TEE("next result removing waiting task ID: %d\n", task_id);
-      llama->queue_results.remove_waiting_task_id(task_id);
-    } else if (shutting_down) {
-      LOG_TEE("aborting completion due to shutdown %d\n", task_id);
-      llama->request_cancel(task_id);
-      llama->queue_results.remove_waiting_task_id(task_id);
-      resp->stop = true;
-    }
-  } catch (std::exception &e) {
-    resp->error = true;
-    resp->id = -1;
-    result_json = "{\"error\":\"exception " + std::string(e.what()) + "\"}";
-    LOG_TEE("llama server completion exception %s\n", e.what());
-  } catch (...) {
-    resp->error = true;
-    resp->id = -1;
-    result_json = "{\"error\":\"Unknown exception during completion\"}";
-    LOG_TEE("llama server completion unknown exception\n");
-  }
-  const std::string::size_type size = result_json.size() + 1;
-  resp->json_resp = new char[size];
-  snprintf(resp->json_resp, size, "%s", result_json.c_str());
-}
-
-void llama_server_release_task_result(ext_server_task_result_t *result) {
-  if (result == NULL || result->json_resp == NULL) {
-    return;
-  }
-  delete[] result->json_resp;
-}
-
-void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err) {
-  assert(llama != NULL && err != NULL);
-  err->id = 0;
-  err->msg[0] = '\0';
-  try {
-    llama->request_cancel(task_id);
-    llama->queue_results.remove_waiting_task_id(task_id);
-  } catch (std::exception &e) {
-    err->id = -1;
-    snprintf(err->msg, err->msg_len, "exception %s", e.what());
-  } catch (...) {
-    err->id = -1;
-    snprintf(err->msg, err->msg_len,
-             "Unknown exception completion cancel in llama server");
-  }
-}
-
-void llama_server_tokenize(const char *json_req, char **json_resp,
-                           ext_server_resp_t *err) {
-  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
-  *json_resp = NULL;
-  err->id = 0;
-  err->msg[0] = '\0';
-  try {
-    if (shutting_down) {
-      throw std::runtime_error("server shutting down");
-    }
-    const json body = json::parse(json_req);
-    std::vector<llama_token> tokens;
-    if (body.count("content") != 0) {
-      tokens = llama->tokenize(body["content"], false);
-    }
-    const json data = format_tokenizer_response(tokens);
-    std::string result_json = data.dump();
-    const std::string::size_type size = result_json.size() + 1;
-    *json_resp = new char[size];
-    snprintf(*json_resp, size, "%s", result_json.c_str());
-  } catch (std::exception &e) {
-    err->id = -1;
-    snprintf(err->msg, err->msg_len, "exception %s", e.what());
-  } catch (...) {
-    err->id = -1;
-    snprintf(err->msg, err->msg_len, "Unknown exception during tokenize");
-  }
-}
-
-void llama_server_release_json_resp(char **json_resp) {
-  if (json_resp == NULL || *json_resp == NULL) {
-    return;
-  }
-  delete[] *json_resp;
-}
-
-void llama_server_detokenize(const char *json_req, char **json_resp,
-                             ext_server_resp_t *err) {
-  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
-  *json_resp = NULL;
-  err->id = 0;
-  err->msg[0] = '\0';
-  try {
-    if (shutting_down) {
-      throw std::runtime_error("server shutting down");
-    }
-    const json body = json::parse(json_req);
-    std::string content;
-    if (body.count("tokens") != 0) {
-      const std::vector<llama_token> tokens = body["tokens"];
-      content = tokens_to_str(llama->ctx, tokens.cbegin(), tokens.cend());
-    }
-    const json data = format_detokenized_response(content);
-    std::string result_json = data.dump();
-    const std::string::size_type size = result_json.size() + 1;
-    *json_resp = new char[size];
-    snprintf(*json_resp, size, "%s", result_json.c_str());
-  } catch (std::exception &e) {
-    err->id = -1;
-    snprintf(err->msg, err->msg_len, "exception %s", e.what());
-  } catch (...) {
-    err->id = -1;
-    snprintf(err->msg, err->msg_len, "Unknown exception during detokenize");
-  }
-}
-
-void llama_server_embedding(const char *json_req, char **json_resp,
-                            ext_server_resp_t *err) {
-  assert(llama != NULL && json_req != NULL && json_resp != NULL && err != NULL);
-  *json_resp = NULL;
-  err->id = 0;
-  err->msg[0] = '\0';
-  try {
-    if (shutting_down) {
-      throw std::runtime_error("server shutting down");
-    }
-    const json body = json::parse(json_req);
-    json prompt;
-    if (body.count("content") != 0) {
-      prompt = body["content"];
-    } else {
-      prompt = "";
-    }
-    const int task_id = llama->queue_tasks.get_new_id();
-    llama->queue_results.add_waiting_task_id(task_id);
-    llama->request_completion(task_id, {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
-    atomicRecv ar(recv_counter);
-    task_result result = llama->queue_results.recv(task_id);
-    std::string result_json = result.result_json.dump();
-    const std::string::size_type size = result_json.size() + 1;
-    *json_resp = new char[size];
-    snprintf(*json_resp, size, "%s", result_json.c_str());
-    llama->queue_results.remove_waiting_task_id(task_id);
-  } catch (std::exception &e) {
-    err->id = -1;
-    snprintf(err->msg, err->msg_len, "exception %s", e.what());
-  } catch (...) {
-    err->id = -1;
-    snprintf(err->msg, err->msg_len, "Unknown exception during embedding");
-  }
-}

llm/ext_server/ext_server.h (+0 -95)

@@ -1,95 +0,0 @@
-#if defined(LLAMA_SERVER_LIBRARY)
-#ifndef LLAMA_SERVER_H
-#define LLAMA_SERVER_H
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-
-int __main(int argc, char **argv);
-
-// This exposes extern C entrypoints into the llama_server
-// To enable the server compile with LLAMA_SERVER_LIBRARY
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-typedef struct ext_server_resp {
-  int id;          // < 0 on error
-  size_t msg_len;  // caller must allocate msg and set msg_len
-  char *msg;
-} ext_server_resp_t;
-
-// Allocated and freed by caller
-typedef struct ext_server_lora_adapter {
-  char *adapter;
-  float scale;
-  struct ext_server_lora_adapter *next;
-} ext_server_lora_adapter_t;
-
-// Allocated and freed by caller
-typedef struct ext_server_params {
-  char *model;
-  uint32_t n_ctx;         // token context window, 0 = from model
-  uint32_t n_batch;       // prompt processing maximum batch size
-  uint32_t n_threads;     // number of threads to use for generation
-  int32_t n_parallel;     // number of parallel sequences to decodewra
-  float rope_freq_base;   // RoPE base frequency, 0 = from model
-  float rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-  bool memory_f16;        // use f16 instead of f32 for memory kv
-  int32_t n_gpu_layers;  // number of layers to store in VRAM (-1 - use default)
-  int32_t main_gpu;      // the GPU that is used for scratch and small tensors
-  bool use_mlock;        // force system to keep model in RAM
-  bool use_mmap;         // use mmap if possible
-  int numa;              // attempt optimizations that help on some NUMA systems
-  bool embedding;        // get only sentence embedding
-  ext_server_lora_adapter_t *lora_adapters;
-  char *mmproj;
-  bool verbose_logging;  // Enable verbose logging of the server
-} ext_server_params_t;
-
-typedef struct ext_server_task_result {
-  int id;
-  bool stop;
-  bool error;
-  char *json_resp;  // null terminated, memory managed by ext_server
-} ext_server_task_result_t;
-
-// Initialize the server once per process
-// err->id = 0 for success and err->msg[0] = NULL
-// err->id != 0 for failure, and err->msg contains error message
-void llama_server_init(ext_server_params_t *sparams, ext_server_resp_t *err);
-
-// Run the main loop, called once per init
-void llama_server_start();
-// Stop the main loop and free up resources allocated in init and start.  Init
-// must be called again to reuse
-void llama_server_stop();
-
-// json_req null terminated string, memory managed by caller
-// resp->id >= 0 on success (task ID)
-// resp->id < 0 on error, and resp->msg contains error message
-void llama_server_completion(const char *json_req, ext_server_resp_t *resp);
-
-// Caller must call llama_server_release_task_result to free resp->json_resp
-void llama_server_completion_next_result(const int task_id,
-                                         ext_server_task_result_t *result);
-void llama_server_completion_cancel(const int task_id, ext_server_resp_t *err);
-void llama_server_release_task_result(ext_server_task_result_t *result);
-
-// Caller must call llama_server_releaes_json_resp to free json_resp if err.id <
-// 0
-void llama_server_tokenize(const char *json_req, char **json_resp,
-                           ext_server_resp_t *err);
-void llama_server_detokenize(const char *json_req, char **json_resp,
-                             ext_server_resp_t *err);
-void llama_server_embedding(const char *json_req, char **json_resp,
-                            ext_server_resp_t *err);
-void llama_server_release_json_resp(char **json_resp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
-#endif  // LLAMA_SERVER_LIBRARY

llm/ext_server/server.cpp (+1 -1)

@@ -2768,7 +2768,7 @@ inline void signal_handler(int signal) {
     shutdown_handler(signal);
 }
 
-int _main(int argc, char **argv)
+int main(int argc, char **argv)
 {
 #if SERVER_VERBOSE != 1
     log_disable();

llm/generate/gen_common.sh (+13 -16)

@@ -14,7 +14,7 @@ init_vars() {
 
     LLAMACPP_DIR=../llama.cpp
     CMAKE_DEFS=""
-    CMAKE_TARGETS="--target ext_server"
+    CMAKE_TARGETS="--target ollama_llama_server"
     if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on -DLLAMA_SERVER_VERBOSE=on ${CMAKE_DEFS}"
     else
@@ -81,27 +81,24 @@ apply_patches() {
 build() {
     cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
-    mkdir -p ${BUILD_DIR}/lib/
-    ls ${BUILD_DIR}
-    g++ -fPIC -g -shared -o ${BUILD_DIR}/lib/libext_server.${LIB_EXT} \
-        ${GCC_ARCH} \
-        ${WHOLE_ARCHIVE} ${BUILD_DIR}/ext_server/libext_server.a ${NO_WHOLE_ARCHIVE} \
-        ${BUILD_DIR}/common/libcommon.a \
-        ${BUILD_DIR}/libllama.a \
-        -Wl,-rpath,\$ORIGIN \
-        -lpthread -ldl -lm \
-        ${EXTRA_LIBS}
 }
 
-compress_libs() {
+compress() {
     echo "Compressing payloads to reduce overall binary size..."
     pids=""
-    rm -rf ${BUILD_DIR}/lib/*.${LIB_EXT}*.gz
-    for lib in ${BUILD_DIR}/lib/*.${LIB_EXT}* ; do
-        gzip -n --best -f ${lib} &
+    rm -rf ${BUILD_DIR}/bin/*.gz
+    for f in ${BUILD_DIR}/bin/* ; do
+        gzip -n --best -f ${f} &
         pids+=" $!"
     done
-    echo 
+    # check for lib directory
+    if [ -d ${BUILD_DIR}/lib ]; then
+        for f in ${BUILD_DIR}/lib/* ; do
+            gzip -n --best -f ${f} &
+            pids+=" $!"
+        done
+    fi
+    echo
     for pid in ${pids}; do
         wait $pid
     done

llm/generate/gen_darwin.sh (+34 -13)

@@ -18,21 +18,31 @@ sign() {
     fi
 }
 
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin"
+COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.0 -DCMAKE_SYSTEM_NAME=Darwin -DLLAMA_METAL_EMBED_LIBRARY=on"
 
 case "${GOARCH}" in
 "amd64")
     COMMON_CPU_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=off -DLLAMA_NATIVE=off"
 
+    # Static build for linking into the Go binary
+    init_vars
+    CMAKE_TARGETS="--target llama --target ggml"
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    BUILD_DIR="../build/darwin/${ARCH}_static"
+    echo "Building static library"
+    build
+
+
     #
     # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
     #
+    init_vars
     CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu"
+    BUILD_DIR="../build/darwin/${ARCH}/cpu"
     echo "Building LCD CPU"
     build
-    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu/lib/libext_server.dylib
-    compress_libs
+    sign ${BUILD_DIR}/lib/libext_server.dylib
+    compress
 
     #
     # ~2011 CPU Dynamic library with more capabilities turned on to optimize performance
@@ -40,11 +50,11 @@ case "${GOARCH}" in
     #
     init_vars
     CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx"
+    BUILD_DIR="../build/darwin/${ARCH}/cpu_avx"
     echo "Building AVX CPU"
     build
-    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx/lib/libext_server.dylib
-    compress_libs
+    sign ${BUILD_DIR}/lib/libext_server.dylib
+    compress
 
     #
     # ~2013 CPU Dynamic library
@@ -52,20 +62,30 @@ case "${GOARCH}" in
     #
     init_vars
     CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_ACCELERATE=on -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2"
+    BUILD_DIR="../build/darwin/${ARCH}/cpu_avx2"
     echo "Building AVX2 CPU"
     EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation"
     build
-    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/cpu_avx2/lib/libext_server.dylib
-    compress_libs
+    sign ${BUILD_DIR}/lib/libext_server.dylib
+    compress
     ;;
 "arm64")
+
+    # Static build for linking into the Go binary
+    init_vars
+    CMAKE_TARGETS="--target llama --target ggml"
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DLLAMA_ACCELERATE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    BUILD_DIR="../build/darwin/${ARCH}_static"
+    echo "Building static library"
+    build
+
+    init_vars
     CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DLLAMA_METAL_EMBED_LIBRARY=on -DLLAMA_ACCELERATE=on -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} -DLLAMA_METAL=on ${CMAKE_DEFS}"
-    BUILD_DIR="${LLAMACPP_DIR}/build/darwin/${ARCH}/metal"
+    BUILD_DIR="../build/darwin/${ARCH}/metal"
     EXTRA_LIBS="${EXTRA_LIBS} -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders"
     build
-    sign ${LLAMACPP_DIR}/build/darwin/${ARCH}/metal/lib/libext_server.dylib
-    compress_libs
+    sign ${BUILD_DIR}/lib/libext_server.dylib
+    compress
     ;;
 *)
     echo "GOARCH must be set"
@@ -75,3 +95,4 @@ case "${GOARCH}" in
 esac
 
 cleanup
+echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

llm/generate/gen_linux.sh (+40 -23)

@@ -57,16 +57,31 @@ init_vars
 git_module_setup
 apply_patches
 
+
+init_vars
 if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
+
+    if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "static" ]; then
+        # Static build for linking into the Go binary
+        init_vars
+        CMAKE_TARGETS="--target llama --target ggml"
+        CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DLLAMA_NATIVE=off -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+        BUILD_DIR="../build/linux/${ARCH}_static"
+        echo "Building static library"
+        build
+    fi
+
+
     # Users building from source can tune the exact flags we pass to cmake for configuring
     # llama.cpp, and we'll build only 1 CPU variant in that case as the default.
     if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
+        init_vars
         echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
         CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
-        BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
+        BUILD_DIR="../build/linux/${ARCH}/cpu"
         echo "Building custom CPU"
         build
-        compress_libs
+        compress
     else
         # Darwin Rosetta x86 emulation does NOT support AVX, AVX2, AVX512
         # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
@@ -83,11 +98,12 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
             #
+            init_vars
             CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-            BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu"
+            BUILD_DIR="../build/linux/${ARCH}/cpu"
             echo "Building LCD CPU"
             build
-            compress_libs
+            compress
         fi
 
         if [ "${ARCH}" == "x86_64" ]; then
@@ -101,10 +117,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                 #
                 init_vars
                 CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-                BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx"
+                BUILD_DIR="../build/linux/${ARCH}/cpu_avx"
                 echo "Building AVX CPU"
                 build
-                compress_libs
+                compress
             fi
 
             if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu_avx2" ]; then
@@ -114,10 +130,10 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
                 #
                 init_vars
                 CMAKE_DEFS="${COMMON_CPU_DEFS} -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on ${CMAKE_DEFS}"
-                BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cpu_avx2"
+                BUILD_DIR="../build/linux/${ARCH}/cpu_avx2"
                 echo "Building AVX2 CPU"
                 build
-                compress_libs
+                compress
             fi
         fi
     fi
@@ -157,7 +173,7 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
         ARM64_DEFS="-DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_CUDA_F16=off"
     fi
     CMAKE_DEFS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_FORCE_MMQ=on -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES} ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} ${ARM64_DEFS}"
-    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/cuda${CUDA_VARIANT}"
+    BUILD_DIR="../build/linux/${ARCH}/cuda${CUDA_VARIANT}"
     EXTRA_LIBS="-L${CUDA_LIB_DIR} -lcudart -lcublas -lcublasLt -lcuda"
     build
 
@@ -165,20 +181,20 @@ if [ -d "${CUDA_LIB_DIR}" ]; then
     #
     # TODO - in the future we may shift to packaging these separately and conditionally
     #        downloading them in the install script.
-    DEPS="$(ldd ${BUILD_DIR}/lib/libext_server.so )"
+    DEPS="$(ldd ${BUILD_DIR}/bin/ollama_llama_server )"
     for lib in libcudart.so libcublas.so libcublasLt.so ; do
         DEP=$(echo "${DEPS}" | grep ${lib} | cut -f1 -d' ' | xargs || true)
         if [ -n "${DEP}" -a -e "${CUDA_LIB_DIR}/${DEP}" ]; then
-            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/lib/"
+            cp "${CUDA_LIB_DIR}/${DEP}" "${BUILD_DIR}/bin/"
         elif [ -e "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" ]; then
-            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/lib/"
+            cp "${CUDA_LIB_DIR}/${lib}.${CUDA_MAJOR}" "${BUILD_DIR}/bin/"
         elif [ -e "${CUDART_LIB_DIR}/${lib}" ]; then
-            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/lib/"
+            cp -d ${CUDART_LIB_DIR}/${lib}* "${BUILD_DIR}/bin/"
         else
-            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/lib/"
+            cp -d "${CUDA_LIB_DIR}/${lib}*" "${BUILD_DIR}/bin/"
         fi
     done
-    compress_libs
+    compress
 
 fi
 
@@ -201,23 +217,24 @@ if [ -d "${ROCM_PATH}" ]; then
     fi
     init_vars
     CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
-    BUILD_DIR="${LLAMACPP_DIR}/build/linux/${ARCH}/rocm${ROCM_VARIANT}"
+    BUILD_DIR="../build/linux/${ARCH}/rocm${ROCM_VARIANT}"
     EXTRA_LIBS="-L${ROCM_PATH}/lib -L/opt/amdgpu/lib/x86_64-linux-gnu/ -Wl,-rpath,\$ORIGIN/../../rocm/ -lhipblas -lrocblas -lamdhip64 -lrocsolver -lamd_comgr -lhsa-runtime64 -lrocsparse -ldrm -ldrm_amdgpu"
     build
 
     # Record the ROCM dependencies
-    rm -f "${BUILD_DIR}/lib/deps.txt"
-    touch "${BUILD_DIR}/lib/deps.txt"
-    for dep in $(ldd "${BUILD_DIR}/lib/libext_server.so" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
-        echo "${dep}" >> "${BUILD_DIR}/lib/deps.txt"
+    rm -f "${BUILD_DIR}/bin/deps.txt"
+    touch "${BUILD_DIR}/bin/deps.txt"
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e rocm -e amdgpu -e libtinfo ); do
+        echo "${dep}" >> "${BUILD_DIR}/bin/deps.txt"
     done
     # bomb out if for some reason we didn't get a few deps
-    if [ $(cat "${BUILD_DIR}/lib/deps.txt" | wc -l ) -lt 8 ] ; then
-        cat "${BUILD_DIR}/lib/deps.txt"
+    if [ $(cat "${BUILD_DIR}/bin/deps.txt" | wc -l ) -lt 8 ] ; then
+        cat "${BUILD_DIR}/bin/deps.txt"
         echo "ERROR: deps file short"
         exit 1
     fi
-    compress_libs
+    compress
 fi
 
 cleanup
+echo "go generate completed.  LLM runners: $(cd ${BUILD_DIR}/..; echo *)"

+ 52 - 46
llm/generate/gen_windows.ps1

@@ -33,7 +33,7 @@ function init_vars {
         "-DBUILD_SHARED_LIBS=on",
         "-DLLAMA_NATIVE=off"
         )
-    $script:cmakeTargets = @("ext_server")
+    $script:cmakeTargets = @("ollama_llama_server")
     $script:ARCH = "amd64" # arm not yet supported.
     if ($env:CGO_CFLAGS -contains "-g") {
         $script:cmakeDefs += @("-DCMAKE_VERBOSE_MAKEFILE=on", "-DLLAMA_SERVER_VERBOSE=on", "-DCMAKE_BUILD_TYPE=RelWithDebInfo")
@@ -97,16 +97,14 @@ function apply_patches {
         }
 
         # Checkout each file
-        Set-Location -Path ${script:llamacppDir}
         foreach ($file in $filePaths) {
-            git checkout $file
+            git -C "${script:llamacppDir}" checkout $file
         }
     }
 
     # Apply each patch
     foreach ($patch in $patches) {
-        Set-Location -Path ${script:llamacppDir}
-        git apply $patch.FullName
+        git -C "${script:llamacppDir}" apply $patch.FullName
     }
 }
 
@@ -115,41 +113,41 @@ function build {
     & cmake --version
     & cmake -S "${script:llamacppDir}" -B $script:buildDir $script:cmakeDefs
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-    write-host "building with: cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })"
+    write-host "building with: cmake --build $script:buildDir --config $script:config $($script:cmakeTargets | ForEach-Object { `"--target`", $_ })"
     & cmake --build $script:buildDir --config $script:config ($script:cmakeTargets | ForEach-Object { "--target", $_ })
     if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
-}
-
-function install {
-    rm -ea 0 -recurse -force -path "${script:buildDir}/lib"
-    md "${script:buildDir}/lib" -ea 0 > $null
-    cp "${script:buildDir}/bin/${script:config}/ext_server.dll" "${script:buildDir}/lib"
-    cp "${script:buildDir}/bin/${script:config}/llama.dll" "${script:buildDir}/lib"
-    # Display the dll dependencies in the build log
-    if ($script:DUMPBIN -ne $null) {
-        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
+    # Rearrange output to be consistent between different generators
+    if ($null -ne ${script:config} -And (test-path -path "${script:buildDir}/bin/${script:config}" ) ) {
+        mv -force "${script:buildDir}/bin/${script:config}/*" "${script:buildDir}/bin/"
+        remove-item "${script:buildDir}/bin/${script:config}"
     }
 }
 
 function sign {
     if ("${env:KEY_CONTAINER}") {
-        write-host "Signing ${script:buildDir}/lib/*.dll"
-        foreach ($file in (get-childitem "${script:buildDir}/lib/*.dll")){
-            & "${script:SignTool}" sign /v /debug /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
+        write-host "Signing ${script:buildDir}/bin/*.exe  ${script:buildDir}/bin/*.dll"
+        foreach ($file in @(get-childitem "${script:buildDir}/bin/*.exe") + @(get-childitem "${script:buildDir}/bin/*.dll")){
+            & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" `
                 /csp "Google Cloud KMS Provider" /kc "${env:KEY_CONTAINER}" $file
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
         }
     }
 }
 
-function compress_libs {
+function compress {
     if ($script:GZIP -eq $null) {
         write-host "gzip not installed, not compressing files"
         return
     }
+    write-host "Compressing binaries..."
+    $binaries = dir "${script:buildDir}/bin/*.exe"
+    foreach ($file in $binaries) {
+        & "$script:GZIP" --best -f $file
+    }
+
     write-host "Compressing dlls..."
-    $libs = dir "${script:buildDir}/lib/*.dll"
-    foreach ($file in $libs) {
+    $dlls = dir "${script:buildDir}/bin/*.dll"
+    foreach ($file in $dlls) {
         & "$script:GZIP" --best -f $file
     }
 }
@@ -164,14 +162,11 @@ function cleanup {
         }
 
         # Checkout each file
-        Set-Location -Path ${script:llamacppDir}
         foreach ($file in $filePaths) {            
-            git checkout $file
+            git -C "${script:llamacppDir}" checkout $file
         }
+        git -C "${script:llamacppDir}" checkout CMakeLists.txt
     }
-    Set-Location "${script:llamacppDir}/"
-    git checkout CMakeLists.txt
-
 }
 
 init_vars
@@ -179,7 +174,6 @@ git_module_setup
 apply_patches
 
 # -DLLAMA_AVX -- 2011 Intel Sandy Bridge & AMD Bulldozer
-# -DLLAMA_F16C -- 2012 Intel Ivy Bridge & AMD 2011 Bulldozer (No significant improvement over just AVX)
 # -DLLAMA_AVX2 -- 2013 Intel Haswell & 2015 AMD Excavator / 2017 AMD Zen
 # -DLLAMA_FMA (FMA3) -- 2013 Intel Haswell & 2012 AMD Piledriver
 
@@ -187,32 +181,46 @@ $script:commonCpuDefs = @("-DCMAKE_POSITION_INDEPENDENT_CODE=on")
 
 if ($null -eq ${env:OLLAMA_SKIP_CPU_GENERATE}) {
 
+# GCC build for direct linking into the Go binary
+init_vars
+$script:cmakeTargets = @("llama", "ggml")
+$script:cmakeDefs = @(
+    "-G", "MinGW Makefiles"
+    "-DBUILD_SHARED_LIBS=off",
+    "-DLLAMA_NATIVE=off",
+    "-DLLAMA_AVX=off",
+    "-DLLAMA_AVX2=off",
+    "-DLLAMA_AVX512=off",
+    "-DLLAMA_F16C=off",
+    "-DLLAMA_FMA=off")
+$script:buildDir="../build/windows/${script:ARCH}_static"
+write-host "Building static library"
+build
+
+# Remaining llama.cpp builds use MSVC
     init_vars
     $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=off", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu"
+    $script:buildDir="../build/windows/${script:ARCH}/cpu"
     write-host "Building LCD CPU"
     build
-    install
     sign
-    compress_libs
+    compress
 
     init_vars
     $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=off", "-DLLAMA_F16C=off") + $script:cmakeDefs
-    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx"
+    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx"
     write-host "Building AVX CPU"
     build
-    install
     sign
-    compress_libs
+    compress
 
     init_vars
     $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=on", "-DLLAMA_AVX512=off", "-DLLAMA_FMA=on", "-DLLAMA_F16C=on") + $script:cmakeDefs
-    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cpu_avx2"
+    $script:buildDir="../build/windows/${script:ARCH}/cpu_avx2"
     write-host "Building AVX2 CPU"
     build
-    install
     sign
-    compress_libs
+    compress
 } else {
     write-host "Skipping CPU generation step as requested"
 }
@@ -225,13 +233,11 @@ if ($null -ne $script:CUDA_LIB_DIR) {
         $script:CUDA_VARIANT="_"+$script:CUDA_VERSION
     }
     init_vars
-    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
+    $script:buildDir="../build/windows/${script:ARCH}/cuda$script:CUDA_VARIANT"
     $script:cmakeDefs += @("-A", "x64", "-DLLAMA_CUBLAS=ON", "-DLLAMA_AVX=on", "-DLLAMA_AVX2=off", "-DCUDAToolkit_INCLUDE_DIR=$script:CUDA_INCLUDE_DIR", "-DCMAKE_CUDA_ARCHITECTURES=${script:CMAKE_CUDA_ARCHITECTURES}")
-    write-host "Building CUDA"
     build
-    install
     sign
-    compress_libs
+    compress
 }
 
 if ($null -ne $env:HIP_PATH) {
@@ -241,7 +247,7 @@ if ($null -ne $env:HIP_PATH) {
     }
 
     init_vars
-    $script:buildDir="${script:llamacppDir}/build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
+    $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT"
     $script:cmakeDefs += @(
         "-G", "Ninja", 
         "-DCMAKE_C_COMPILER=clang.exe",
@@ -264,13 +270,13 @@ if ($null -ne $env:HIP_PATH) {
     build
     # Ninja doesn't prefix with config name
     ${script:config}=""
-    install
     if ($null -ne $script:DUMPBIN) {
-        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/${script:config}/ext_server.dll" | select-string ".dll"
+        & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | select-string ".dll"
     }
     sign
-    compress_libs
+    compress
 }
 
+
 cleanup
-write-host "`ngo generate completed.  LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\llama.cpp\build\windows\${script:ARCH})"
+write-host "`ngo generate completed.  LLM runners: $(get-childitem -path ${script:SRC_DIR}\llm\build\windows\${script:ARCH})"

+ 1 - 1
llm/generate/generate_darwin.go

@@ -1,3 +1,3 @@
 package generate
 
-//go:generate sh ./gen_darwin.sh
+//go:generate bash ./gen_darwin.sh

+ 0 - 100
llm/llama.go

@@ -1,100 +0,0 @@
-package llm
-
-import (
-	_ "embed"
-	"fmt"
-	"time"
-
-	"github.com/ollama/ollama/api"
-)
-
-const jsonGrammar = `
-root   ::= object
-value  ::= object | array | string | number | ("true" | "false" | "null") ws
-
-object ::=
-  "{" ws (
-            string ":" ws value
-    ("," ws string ":" ws value)*
-  )? "}" ws
-
-array  ::=
-  "[" ws (
-            value
-    ("," ws value)*
-  )? "]" ws
-
-string ::=
-  "\"" (
-    [^"\\] |
-    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
-  )* "\"" ws
-
-number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
-
-# Optional space: by convention, applied in this grammar after literal chars when allowed
-ws ::= ([ \t\n] ws)?
-`
-
-type ImageData struct {
-	Data []byte `json:"data"`
-	ID   int    `json:"id"`
-}
-
-var payloadMissing = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")
-
-type prediction struct {
-	Content string `json:"content"`
-	Model   string `json:"model"`
-	Prompt  string `json:"prompt"`
-	Stop    bool   `json:"stop"`
-
-	Timings struct {
-		PredictedN  int     `json:"predicted_n"`
-		PredictedMS float64 `json:"predicted_ms"`
-		PromptN     int     `json:"prompt_n"`
-		PromptMS    float64 `json:"prompt_ms"`
-	}
-}
-
-const maxRetries = 3
-
-type PredictOpts struct {
-	Prompt  string
-	Format  string
-	Images  []ImageData
-	Options api.Options
-}
-
-type PredictResult struct {
-	Content            string
-	Done               bool
-	PromptEvalCount    int
-	PromptEvalDuration time.Duration
-	EvalCount          int
-	EvalDuration       time.Duration
-}
-
-type TokenizeRequest struct {
-	Content string `json:"content"`
-}
-
-type TokenizeResponse struct {
-	Tokens []int `json:"tokens"`
-}
-
-type DetokenizeRequest struct {
-	Tokens []int `json:"tokens"`
-}
-
-type DetokenizeResponse struct {
-	Content string `json:"content"`
-}
-
-type EmbeddingRequest struct {
-	Content string `json:"content"`
-}
-
-type EmbeddingResponse struct {
-	Embedding []float64 `json:"embedding"`
-}

+ 12 - 180
llm/llm.go

@@ -1,183 +1,15 @@
 package llm
 
-import (
-	"context"
-	"fmt"
-	"log/slog"
-	"os"
-	"slices"
-	"strings"
-
-	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
-)
-
-type LLM interface {
-	Predict(context.Context, PredictOpts, func(PredictResult)) error
-	Embedding(context.Context, string) ([]float64, error)
-	Encode(context.Context, string) ([]int, error)
-	Decode(context.Context, []int) (string, error)
-	Close()
-}
-
-var cpuOnlyFamilies = []string{
-	"mamba",
-}
-
-func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
-	if _, err := os.Stat(model); err != nil {
-		return nil, err
-	}
-
-	f, err := os.Open(model)
-	if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	ggml, _, err := DecodeGGML(f)
-	if err != nil {
-		return nil, err
-	}
-
-	if opts.NumCtx > int(ggml.KV().ContextLength()) {
-		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
-		opts.NumCtx = int(ggml.KV().ContextLength())
-	}
-
-	if opts.NumCtx < 4 {
-		opts.NumCtx = 4
-	}
-
-	availableMemory, _ := gpu.CheckVRAM()
-	info := gpu.GetGPUInfo()
-
-	usedMemory := info.MinimumMemory
-	for _, projector := range projectors {
-		usedMemory += projectorMemoryRequirements(projector)
-
-		// multimodal models require at least 2048 context
-		opts.NumCtx = max(opts.NumCtx, 2048)
-	}
-
-	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
-
-	// this amount is the overhead + tensors in memory
-	// TODO: get this from the llama.cpp's graph calculations instead of
-	// estimating it's 1/6 * kv_cache_size * num_gqa
-	graph := int64(ggml.KV().GQA()) * kv / 6
-	usedMemory += graph
-
-	if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
-		info.Library = "cpu"
-	}
-
-	requiredMemory := usedMemory
-
-	var layers int
-	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
-		requiredMemory += layerMemory
-
-		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
-			usedMemory += layerMemory
-			layers++
-		}
-	}
-
-	memOutputLayer := ggml.LayerSize("output.")
-	requiredMemory += memOutputLayer
-
-	// only offload output layer if all repeating layers are offloaded
-	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
-		usedMemory += memOutputLayer
-		layers++
-	}
-
-	slog.Info(
-		"offload to gpu",
-		"layers", layers,
-		"required", format.HumanBytes2(requiredMemory),
-		"used", format.HumanBytes2(usedMemory),
-		"available", format.HumanBytes2(availableMemory),
-		"kv", format.HumanBytes2(kv),
-		"graph", format.HumanBytes2(graph),
-	)
-
-	if opts.NumGPU < 0 && info.Library != "cpu" {
-		opts.NumGPU = layers
-	}
-
-	return newLlmServer(info, model, adapters, projectors, opts)
-}
-
-func projectorMemoryRequirements(filename string) int64 {
-	file, err := os.Open(filename)
-	if err != nil {
-		return 0
-	}
-	defer file.Close()
-
-	ggml, _, err := DecodeGGML(file)
-	if err != nil {
-		return 0
-	}
-
-	prefixes := make(map[string]struct{})
-	for _, layer := range ggml.Tensors() {
-		parts := strings.Split(layer.Name, ".")
-		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
-	}
-
-	var ask int64
-	for prefix := range prefixes {
-		ask += ggml.LayerSize(prefix)
-	}
-
-	return ask
-}
-
-// Give any native cgo implementations an opportunity to initialize
-func Init() error {
-	return nativeInit()
-}
-
-func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
-	dynLibs := getDynLibs(gpuInfo)
-
-	// Check to see if the user has requested a specific library instead of auto-detecting
-	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
-	if demandLib != "" {
-		libPath := availableDynLibs[demandLib]
-		if libPath == "" {
-			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
-		} else {
-			slog.Info(fmt.Sprintf("Loading OLLAMA_LLM_LIBRARY=%s", demandLib))
-			dynLibs = []string{libPath}
-		}
-	}
-
-	// We stage into a temp directory, and if we've been idle for a while, it may have been reaped
-	_, err := os.Stat(dynLibs[0])
-	if err != nil {
-		slog.Info(fmt.Sprintf("%s has disappeared, reloading libraries", dynLibs[0]))
-		err = nativeInit()
-		if err != nil {
-			return nil, err
-		}
-	}
-
-	err2 := fmt.Errorf("unable to locate suitable llm library")
-	for _, dynLib := range dynLibs {
-		srv, err := newDynExtServer(dynLib, model, adapters, projectors, opts)
-		if err == nil {
-			return srv, nil
-		}
-		slog.Warn(fmt.Sprintf("Failed to load dynamic library %s  %s", dynLib, err))
-		err2 = err
-	}
-
-	return nil, err2
+// #cgo CFLAGS: -Illama.cpp
+// #cgo darwin,arm64 LDFLAGS: ${SRCDIR}/build/darwin/arm64_static/libllama.a -lstdc++
+// #cgo darwin,amd64 LDFLAGS: ${SRCDIR}/build/darwin/x86_64_static/libllama.a -lstdc++
+// #cgo windows,amd64 LDFLAGS: ${SRCDIR}/build/windows/amd64_static/libllama.a -static -lstdc++
+// #cgo linux,amd64 LDFLAGS: ${SRCDIR}/build/linux/x86_64_static/libllama.a -lstdc++
+// #cgo linux,arm64 LDFLAGS: ${SRCDIR}/build/linux/arm64_static/libllama.a -lstdc++
+// #include "llama.h"
+import "C"
+
+// SystemInfo is an unused example of calling llama.cpp functions using CGo
+func SystemInfo() string {
+	return C.GoString(C.llama_print_system_info())
 }
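
Note: llm.go is now a thin cgo shim over the static llama.cpp archives built above, so the exported SystemInfo helper can be exercised from any Go program that imports the package. A minimal sketch (assuming go generate has already produced the static libraries; the import path matches the module path used elsewhere in this change):

package main

import (
	"fmt"

	"github.com/ollama/ollama/llm"
)

func main() {
	// SystemInfo wraps llama_print_system_info and reports which features
	// (AVX, Metal, etc.) the linked llama.cpp build was compiled with.
	fmt.Println(llm.SystemInfo())
}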

+ 1 - 1
llm/payload_linux.go → llm/llm_darwin_amd64.go

@@ -4,5 +4,5 @@ import (
 	"embed"
 )
 
-//go:embed llama.cpp/build/linux/*/*/lib/*
+//go:embed build/darwin/x86_64/*/bin/*
 var libEmbed embed.FS

+ 1 - 1
llm/payload_windows.go → llm/llm_darwin_arm64.go

@@ -4,5 +4,5 @@ import (
 	"embed"
 )
 
-//go:embed llama.cpp/build/windows/*/*/lib/*.dll*
+//go:embed build/darwin/arm64/*/bin/*
 var libEmbed embed.FS

+ 6 - 0
llm/llm_linux.go

@@ -0,0 +1,6 @@
+package llm
+
+import "embed"
+
+//go:embed build/linux/*/*/bin/*
+var libEmbed embed.FS

+ 6 - 0
llm/llm_windows.go

@@ -0,0 +1,6 @@
+package llm
+
+import "embed"
+
+//go:embed build/windows/*/*/bin/*
+var libEmbed embed.FS
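
Note: each GOOS now embeds its runner payloads from build/<os>/<arch>/<variant>/bin/ into the same libEmbed filesystem. A small in-package sketch (a hypothetical debug helper, not part of this change) showing how those payloads can be enumerated with the same glob the extractor in llm/payload.go uses:

package llm

import (
	"io/fs"
	"log/slog"
)

// logEmbeddedPayloads lists every embedded runner file that extractFiles
// would unpack (build/$OS/$GOARCH/$VARIANT/bin/*).
func logEmbeddedPayloads() {
	files, err := fs.Glob(libEmbed, "build/*/*/*/bin/*")
	if err != nil {
		slog.Error("globbing embedded payloads", "error", err)
		return
	}
	for _, file := range files {
		slog.Debug("embedded payload", "file", file)
	}
}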

+ 211 - 0
llm/payload.go

@@ -0,0 +1,211 @@
+package llm
+
+import (
+	"compress/gzip"
+	"errors"
+	"fmt"
+	"io"
+	"io/fs"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"golang.org/x/exp/slices"
+	"golang.org/x/sync/errgroup"
+
+	"github.com/ollama/ollama/gpu"
+)
+
+var errPayloadMissing = fmt.Errorf("expected payloads not included in this build of ollama")
+
+func Init() error {
+	payloadsDir, err := gpu.PayloadsDir()
+	if err != nil {
+		return err
+	}
+
+	slog.Info("extracting embedded files", "dir", payloadsDir)
+	binGlob := "build/*/*/*/bin/*"
+
+	// extract server libraries
+	err = extractFiles(payloadsDir, binGlob)
+	if err != nil {
+		return fmt.Errorf("extract binaries: %v", err)
+	}
+
+	var variants []string
+	for v := range availableServers() {
+		variants = append(variants, v)
+	}
+	slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
+	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
+
+	return nil
+}
+
+// Runner directory names may contain an optional variant separated by '_'
+// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
+// Any library without a variant is the lowest common denominator
+func availableServers() map[string]string {
+	payloadsDir, err := gpu.PayloadsDir()
+	if err != nil {
+		slog.Error("payload lookup error", "error", err)
+		return nil
+	}
+
+	// glob payloadsDir for the extracted runner directories, one per variant
+	pattern := filepath.Join(payloadsDir, "*")
+
+	files, err := filepath.Glob(pattern)
+	if err != nil {
+		slog.Debug("could not glob", "pattern", pattern, "error", err)
+		return nil
+	}
+
+	servers := make(map[string]string)
+	for _, file := range files {
+		slog.Debug("availableServers : found", "file", file)
+		servers[filepath.Base(file)] = file
+	}
+
+	return servers
+}
+
+// serversForGpu returns a list of compatible servers given the provided GPU
+// info, ordered by performance. Assumes Init() has been called
+// TODO - switch to metadata based mapping
+func serversForGpu(info gpu.GpuInfo) []string {
+	// look up the extracted runner variants
+	availableServers := availableServers()
+	requested := info.Library
+	if info.Variant != "" {
+		requested += "_" + info.Variant
+	}
+
+	servers := []string{}
+
+	// exact match first
+	for a := range availableServers {
+		if a == requested {
+			servers = []string{a}
+
+			if a == "metal" {
+				return servers
+			}
+
+			break
+		}
+	}
+
+	alt := []string{}
+
+	// Then for GPUs load alternates and sort the list for consistent load ordering
+	if info.Library != "cpu" {
+		for a := range availableServers {
+			if info.Library == strings.Split(a, "_")[0] && a != requested {
+				alt = append(alt, a)
+			}
+		}
+
+		slices.Sort(alt)
+		servers = append(servers, alt...)
+	}
+
+	// Load up the best CPU variant if not primary requested
+	if info.Library != "cpu" {
+		variant := gpu.GetCPUVariant()
+		// If no variant, then we fall back to default
+		// If we have a variant, try that if we find an exact match
+		// Attempting to run the wrong CPU instructions will panic the
+		// process
+		if variant != "" {
+			for cmp := range availableServers {
+				if cmp == "cpu_"+variant {
+					servers = append(servers, cmp)
+					break
+				}
+			}
+		} else {
+			servers = append(servers, "cpu")
+		}
+	}
+
+	if len(servers) == 0 {
+		servers = []string{"cpu"}
+	}
+
+	return servers
+}
+
+// extractFiles extracts the embedded files matching glob into the target directory
+func extractFiles(targetDir string, glob string) error {
+	files, err := fs.Glob(libEmbed, glob)
+	if err != nil || len(files) == 0 {
+		return errPayloadMissing
+	}
+
+	if err := os.MkdirAll(targetDir, 0o755); err != nil {
+		return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
+	}
+
+	g := new(errgroup.Group)
+
+	// build/$OS/$GOARCH/$VARIANT/{bin,lib}/$FILE
+	for _, file := range files {
+		filename := file
+
+		variant := filepath.Base(filepath.Dir(filepath.Dir(filename)))
+
+		slog.Debug("extracting", "variant", variant, "file", filename)
+
+		g.Go(func() error {
+			srcf, err := libEmbed.Open(filename)
+			if err != nil {
+				return err
+			}
+			defer srcf.Close()
+
+			src := io.Reader(srcf)
+			if strings.HasSuffix(filename, ".gz") {
+				src, err = gzip.NewReader(src)
+				if err != nil {
+					return fmt.Errorf("decompress payload %s: %v", filename, err)
+				}
+				filename = strings.TrimSuffix(filename, ".gz")
+			}
+
+			variantDir := filepath.Join(targetDir, variant)
+			if err := os.MkdirAll(variantDir, 0o755); err != nil {
+				return fmt.Errorf("extractFiles could not mkdir %s: %v", variantDir, err)
+			}
+
+			base := filepath.Base(filename)
+			destFilename := filepath.Join(variantDir, base)
+
+			_, err = os.Stat(destFilename)
+			switch {
+			case errors.Is(err, os.ErrNotExist):
+				destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+				if err != nil {
+					return fmt.Errorf("write payload %s: %v", filename, err)
+				}
+				defer destFile.Close()
+				if _, err := io.Copy(destFile, src); err != nil {
+					return fmt.Errorf("copy payload %s: %v", filename, err)
+				}
+			case err != nil:
+				return fmt.Errorf("stat payload %s: %v", filename, err)
+			}
+			return nil
+		})
+	}
+
+	err = g.Wait()
+	if err != nil {
+		// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
+		gpu.Cleanup()
+		return err
+	}
+	return nil
+}
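
Note: taken together, Init() replaces the old nativeInit()/extractDynamicLibs pair: it unpacks the embedded runners once, and serversForGpu later builds an ordered candidate list per GPU. A minimal sketch of the expected startup sequence from a caller's point of view (hypothetical wiring; the real call site lives in the server code):

package main

import (
	"log/slog"
	"os"

	"github.com/ollama/ollama/llm"
)

func main() {
	// Unpack build/$OS/$GOARCH/$VARIANT/bin/* into the payloads directory
	// before any model is loaded.
	if err := llm.Init(); err != nil {
		slog.Error("failed to extract llm payloads", "error", err)
		os.Exit(1)
	}
	// ... llm.NewLlamaServer(model, adapters, projectors, opts) then picks
	// the best available runner (or the one forced via OLLAMA_LLM_LIBRARY).
}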

+ 0 - 233
llm/payload_common.go

@@ -1,233 +0,0 @@
-package llm
-
-import (
-	"compress/gzip"
-	"errors"
-	"fmt"
-	"io"
-	"io/fs"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"runtime"
-	"strings"
-	"sync"
-
-	"golang.org/x/exp/slices"
-	"golang.org/x/sync/errgroup"
-
-	"github.com/ollama/ollama/gpu"
-)
-
-// Libraries names may contain an optional variant separated by '_'
-// For example, "rocm_v6" and "rocm_v5" or "cpu" and "cpu_avx2"
-// Any library without a variant is the lowest common denominator
-var availableDynLibs = map[string]string{}
-
-const pathComponentCount = 7
-
-// getDynLibs returns an ordered list of LLM libraries to try, starting with the best
-func getDynLibs(gpuInfo gpu.GpuInfo) []string {
-	// Short circuit if we know we're using the default built-in (darwin only)
-	if gpuInfo.Library == "default" {
-		return []string{"default"}
-	}
-	// TODO - temporary until we have multiple CPU variations for Darwin
-	// Short circuit on darwin with metal only
-	if len(availableDynLibs) == 1 {
-		if _, onlyMetal := availableDynLibs["metal"]; onlyMetal {
-			return []string{availableDynLibs["metal"]}
-		}
-	}
-
-	exactMatch := ""
-	dynLibs := []string{}
-	altDynLibs := []string{}
-	requested := gpuInfo.Library
-	if gpuInfo.Variant != "" {
-		requested += "_" + gpuInfo.Variant
-	}
-	// Try to find an exact match
-	for cmp := range availableDynLibs {
-		if requested == cmp {
-			exactMatch = cmp
-			dynLibs = []string{availableDynLibs[cmp]}
-			break
-		}
-	}
-	// Then for GPUs load alternates and sort the list for consistent load ordering
-	if gpuInfo.Library != "cpu" {
-		for cmp := range availableDynLibs {
-			if gpuInfo.Library == strings.Split(cmp, "_")[0] && cmp != exactMatch {
-				altDynLibs = append(altDynLibs, cmp)
-			}
-		}
-		slices.Sort(altDynLibs)
-		for _, altDynLib := range altDynLibs {
-			dynLibs = append(dynLibs, availableDynLibs[altDynLib])
-		}
-	}
-
-	// Load up the best CPU variant if not primary requested
-	if gpuInfo.Library != "cpu" {
-		variant := gpu.GetCPUVariant()
-		// If no variant, then we fall back to default
-		// If we have a variant, try that if we find an exact match
-		// Attempting to run the wrong CPU instructions will panic the
-		// process
-		if variant != "" {
-			for cmp := range availableDynLibs {
-				if cmp == "cpu_"+variant {
-					dynLibs = append(dynLibs, availableDynLibs[cmp])
-					break
-				}
-			}
-		} else {
-			dynLibs = append(dynLibs, availableDynLibs["cpu"])
-		}
-	}
-
-	// Finally, if we didn't find any matches, LCD CPU FTW
-	if len(dynLibs) == 0 {
-		dynLibs = []string{availableDynLibs["cpu"]}
-	}
-	slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
-	return dynLibs
-}
-
-func rocmDynLibPresent() bool {
-	for dynLibName := range availableDynLibs {
-		if strings.HasPrefix(dynLibName, "rocm") {
-			return true
-		}
-	}
-	return false
-}
-
-func nativeInit() error {
-	payloadsDir, err := gpu.PayloadsDir()
-	if err != nil {
-		return err
-	}
-
-	slog.Info(fmt.Sprintf("Extracting dynamic libraries to %s ...", payloadsDir))
-
-	libs, err := extractDynamicLibs(payloadsDir, "llama.cpp/build/*/*/*/lib/*")
-	if err != nil {
-		if errors.Is(err, payloadMissing) {
-			slog.Info(fmt.Sprintf("%s", payloadMissing))
-			return nil
-		}
-		return err
-	}
-	for _, lib := range libs {
-		// The last dir component is the variant name
-		variant := filepath.Base(filepath.Dir(lib))
-		availableDynLibs[variant] = lib
-	}
-
-	if err := verifyDriverAccess(); err != nil {
-		return err
-	}
-
-	// Report which dynamic libraries we have loaded to assist troubleshooting
-	variants := make([]string, len(availableDynLibs))
-	i := 0
-	for variant := range availableDynLibs {
-		variants[i] = variant
-		i++
-	}
-	slog.Info(fmt.Sprintf("Dynamic LLM libraries %v", variants))
-	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-
-	return nil
-}
-
-func extractDynamicLibs(payloadsDir, glob string) ([]string, error) {
-	files, err := fs.Glob(libEmbed, glob)
-	if err != nil || len(files) == 0 {
-		return nil, payloadMissing
-	}
-
-	var mu sync.Mutex
-	var libs []string
-	var g errgroup.Group
-	for _, file := range files {
-		pathComps := strings.Split(file, "/")
-		if len(pathComps) != pathComponentCount {
-			slog.Error(fmt.Sprintf("unexpected payload components: %v", pathComps))
-			continue
-		}
-
-		file := file
-		g.Go(func() error {
-			// llama.cpp/build/$OS/$GOARCH/$VARIANT/lib/$LIBRARY
-			// Include the variant in the path to avoid conflicts between multiple server libs
-			targetDir := filepath.Join(payloadsDir, pathComps[pathComponentCount-3])
-			srcFile, err := libEmbed.Open(file)
-			if err != nil {
-				return fmt.Errorf("read payload %s: %v", file, err)
-			}
-			defer srcFile.Close()
-			if err := os.MkdirAll(targetDir, 0o755); err != nil {
-				return fmt.Errorf("create payload lib dir %s: %v", payloadsDir, err)
-			}
-			src := io.Reader(srcFile)
-			filename := file
-			if strings.HasSuffix(file, ".gz") {
-				src, err = gzip.NewReader(src)
-				if err != nil {
-					return fmt.Errorf("decompress payload %s: %v", file, err)
-				}
-				filename = strings.TrimSuffix(filename, ".gz")
-			}
-
-			destFile := filepath.Join(targetDir, filepath.Base(filename))
-			if strings.Contains(destFile, "server") {
-				mu.Lock()
-				libs = append(libs, destFile)
-				mu.Unlock()
-			}
-
-			destFp, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-			if err != nil {
-				return fmt.Errorf("write payload %s: %v", file, err)
-			}
-			defer destFp.Close()
-			if _, err := io.Copy(destFp, src); err != nil {
-				return fmt.Errorf("copy payload %s: %v", file, err)
-			}
-			return nil
-		})
-	}
-	err = g.Wait()
-	if err != nil {
-		// If we fail to extract, the payload dir is unusable, so cleanup whatever we extracted
-		gpu.Cleanup()
-		return nil, err
-	}
-	return libs, nil
-}
-
-func verifyDriverAccess() error {
-	if runtime.GOOS != "linux" {
-		return nil
-	}
-	// Only check ROCm access if we have the dynamic lib loaded
-	if rocmDynLibPresent() {
-		// Verify we have permissions - either running as root, or we have group access to the driver
-		fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
-		if err != nil {
-			if errors.Is(err, fs.ErrPermission) {
-				return fmt.Errorf("Radeon card detected, but permissions not set up properly.  Either run ollama as root, or add you user account to the render group.")
-			} else if errors.Is(err, fs.ErrNotExist) {
-				// expected behavior without a radeon card
-				return nil
-			}
-
-			return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
-		}
-		fd.Close()
-	}
-	return nil
-}

+ 0 - 8
llm/payload_darwin_amd64.go

@@ -1,8 +0,0 @@
-package llm
-
-import (
-	"embed"
-)
-
-//go:embed llama.cpp/build/darwin/x86_64/*/lib/*.dylib*
-var libEmbed embed.FS

+ 0 - 8
llm/payload_darwin_arm64.go

@@ -1,8 +0,0 @@
-package llm
-
-import (
-	"embed"
-)
-
-//go:embed llama.cpp/ggml-metal.metal llama.cpp/build/darwin/arm64/*/lib/*.dylib*
-var libEmbed embed.FS

+ 0 - 58
llm/payload_test.go

@@ -1,58 +0,0 @@
-package llm
-
-import (
-	"testing"
-
-	"github.com/ollama/ollama/gpu"
-	"github.com/stretchr/testify/assert"
-)
-
-func TestGetDynLibs(t *testing.T) {
-	availableDynLibs = map[string]string{
-		"cpu": "X_cpu",
-	}
-	assert.Equal(t, false, rocmDynLibPresent())
-	res := getDynLibs(gpu.GpuInfo{Library: "cpu"})
-	assert.Len(t, res, 1)
-	assert.Equal(t, availableDynLibs["cpu"], res[0])
-
-	variant := gpu.GetCPUVariant()
-	if variant != "" {
-		variant = "_" + variant
-	}
-	availableDynLibs = map[string]string{
-		"rocm_v5":       "X_rocm_v5",
-		"rocm_v6":       "X_rocm_v6",
-		"cpu" + variant: "X_cpu",
-	}
-	assert.Equal(t, true, rocmDynLibPresent())
-	res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
-	assert.Len(t, res, 3)
-	assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
-	assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
-	assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
-
-	res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
-	assert.Len(t, res, 3)
-	assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
-	assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
-	assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
-
-	res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
-	assert.Len(t, res, 1)
-	assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
-
-	res = getDynLibs(gpu.GpuInfo{Library: "default"})
-	assert.Len(t, res, 1)
-	assert.Equal(t, "default", res[0])
-
-	availableDynLibs = map[string]string{
-		"rocm":          "X_rocm_v5",
-		"cpu" + variant: "X_cpu",
-	}
-	assert.Equal(t, true, rocmDynLibPresent())
-	res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
-	assert.Len(t, res, 2)
-	assert.Equal(t, availableDynLibs["rocm"], res[0])
-	assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
-}
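
Note: with availableDynLibs gone, TestGetDynLibs has no direct replacement in this change. A rough sketch of an equivalent in-package test against serversForGpu (hypothetical, and it assumes go generate has produced payloads for Init() to extract, since the new lookup reads the payloads directory instead of an in-memory map):

package llm

import (
	"testing"

	"github.com/ollama/ollama/gpu"
	"github.com/stretchr/testify/assert"
)

func TestServersForGpu(t *testing.T) {
	// serversForGpu reads the extracted payloads directory, so the
	// embedded runners must be unpacked first.
	assert.Nil(t, Init())

	// A CPU-only request should always resolve to at least one runner,
	// falling back to the plain "cpu" variant if nothing else matches.
	servers := serversForGpu(gpu.GpuInfo{Library: "cpu"})
	assert.NotEmpty(t, servers)
}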

+ 854 - 0
llm/server.go

@@ -0,0 +1,854 @@
+package llm
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"log/slog"
+	"math/rand"
+	"net"
+	"net/http"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"slices"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/gpu"
+)
+
+// LlamaServer is an instance of the llama.cpp server
+type LlamaServer struct {
+	port    int
+	cmd     *exec.Cmd
+	done    chan error // Channel to signal when the process exits
+	status  *StatusWriter
+	options *api.Options
+}
+
+var cpuOnlyFamilies = []string{
+	"mamba",
+}
+
+func NewLlamaServer(model string, adapters, projectors []string, opts *api.Options) (*LlamaServer, error) {
+	if _, err := os.Stat(model); err != nil {
+		return nil, err
+	}
+
+	f, err := os.Open(model)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	ggml, _, err := DecodeGGML(f)
+	if err != nil {
+		return nil, err
+	}
+
+	if opts.NumCtx > int(ggml.KV().ContextLength()) {
+		slog.Warn("requested context length is greater than model max context length", "requested", opts.NumCtx, "model", ggml.KV().ContextLength())
+		opts.NumCtx = int(ggml.KV().ContextLength())
+	}
+
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
+
+	availableMemory, _ := gpu.CheckVRAM()
+	info := gpu.GetGPUInfo()
+
+	usedMemory := info.MinimumMemory
+	for _, projector := range projectors {
+		usedMemory += projectorMemoryRequirements(projector)
+
+		// multimodal models require at least 2048 context
+		opts.NumCtx = max(opts.NumCtx, 2048)
+	}
+
+	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
+
+	// this amount is the overhead + tensors in memory
+	// TODO: get this from the llama.cpp's graph calculations instead of
+	// estimating it's 1/6 * kv_cache_size * num_gqa
+	graph := int64(ggml.KV().GQA()) * kv / 6
+	usedMemory += graph
+
+	if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+		info.Library = "cpu"
+	}
+
+	requiredMemory := usedMemory
+
+	var layers int
+	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+		requiredMemory += layerMemory
+
+		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
+			usedMemory += layerMemory
+			layers++
+		}
+	}
+
+	memOutputLayer := ggml.LayerSize("output.")
+	requiredMemory += memOutputLayer
+
+	// only offload output layer if all repeating layers are offloaded
+	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
+		usedMemory += memOutputLayer
+		layers++
+	}
+
+	slog.Info(
+		"offload to gpu",
+		"layers", layers,
+		"required", format.HumanBytes2(requiredMemory),
+		"used", format.HumanBytes2(usedMemory),
+		"available", format.HumanBytes2(availableMemory),
+		"kv", format.HumanBytes2(kv),
+		"graph", format.HumanBytes2(graph),
+	)
+
+	if opts.NumGPU < 0 && info.Library != "cpu" {
+		opts.NumGPU = layers
+	}
+
+	if len(adapters) > 1 {
+		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
+	}
+
+	availableServers := availableServers()
+	servers := serversForGpu(info)
+
+	demandLib := os.Getenv("OLLAMA_LLM_LIBRARY")
+	if demandLib != "" {
+		serverPath := availableServers[demandLib]
+		if serverPath == "" {
+			slog.Info(fmt.Sprintf("Invalid OLLAMA_LLM_LIBRARY %s - not found", demandLib))
+		} else {
+			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
+			servers = []string{demandLib}
+		}
+	}
+
+	if len(servers) == 0 {
+		return nil, fmt.Errorf("no servers found for %v", info)
+	}
+
+	params := []string{
+		"--model", model,
+		"--ctx-size", fmt.Sprintf("%d", opts.NumCtx),
+		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
+		"--embedding",
+	}
+	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+		params = append(params, "--log-format", "json")
+	} else {
+		params = append(params, "--log-disable")
+	}
+
+	if opts.NumGPU > 0 {
+		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
+	}
+
+	if debug := os.Getenv("OLLAMA_DEBUG"); debug != "" {
+		params = append(params, "--verbose")
+	}
+
+	if opts.MainGPU > 0 {
+		params = append(params, "--main-gpu", fmt.Sprintf("%d", opts.MainGPU))
+	}
+
+	if opts.RopeFrequencyBase > 0 {
+		params = append(params, "--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase))
+	}
+
+	if opts.RopeFrequencyScale > 0 {
+		params = append(params, "--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale))
+	}
+
+	if len(adapters) > 0 {
+		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
+		params = append(params, "--lora", adapters[0])
+	}
+
+	if len(projectors) > 0 {
+		// TODO: applying multiple projectors is not supported by the llama.cpp server yet
+		params = append(params, "--mmproj", projectors[0])
+	}
+
+	if opts.NumThread > 0 {
+		params = append(params, "--threads", fmt.Sprintf("%d", opts.NumThread))
+	}
+
+	if !opts.F16KV {
+		params = append(params, "--memory-f32")
+	}
+
+	if opts.UseMLock {
+		params = append(params, "--mlock")
+	}
+
+	if !opts.UseMMap {
+		params = append(params, "--no-mmap")
+	}
+
+	if opts.UseNUMA {
+		params = append(params, "--numa")
+	}
+
+	// Loop through potential servers
+	var finalErr error
+	for i := 0; i < len(servers); i++ {
+		dir := availableServers[servers[i]]
+
+		// Find an available port, retrying on each iteration in case the failure was a port conflict race
+		port := 0
+		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
+			var l *net.TCPListener
+			if l, err = net.ListenTCP("tcp", a); err == nil {
+				port = l.Addr().(*net.TCPAddr).Port
+				l.Close()
+			}
+		}
+		if port == 0 {
+			slog.Debug("ResolveTCPAddr failed ", "error", err)
+			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
+		}
+		finalParams := append(params, "--port", strconv.Itoa(port))
+
+		pathEnv := "LD_LIBRARY_PATH"
+		if runtime.GOOS == "windows" {
+			pathEnv = "PATH"
+		}
+		// append the server directory to LD_LIBRARY_PATH/PATH
+		libraryPaths := []string{dir}
+		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
+			// Append our runner directory to the path
+			// This will favor system libraries over our bundled library dependencies
+			libraryPaths = append(filepath.SplitList(libraryPath), libraryPaths...)
+		}
+
+		server := filepath.Join(dir, "ollama_llama_server")
+		if runtime.GOOS == "windows" {
+			server = server + ".exe"
+		}
+
+		s := &LlamaServer{
+			port:    port,
+			cmd:     exec.Command(server, finalParams...),
+			done:    make(chan error, 1),
+			status:  NewStatusWriter(os.Stderr),
+			options: opts,
+		}
+		libEnv := fmt.Sprintf("%s=%s", pathEnv, strings.Join(libraryPaths, string(filepath.ListSeparator)))
+		slog.Debug(libEnv)
+		s.cmd.Env = append(os.Environ(), libEnv)
+		s.cmd.Stdout = os.Stdout
+		s.cmd.Stderr = s.status
+
+		slog.Info("starting llama server", "cmd", s.cmd.String())
+
+		if err = s.cmd.Start(); err != nil {
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
+			}
+			err = fmt.Errorf("error starting the external llama server: %v %s", err, msg)
+			finalErr = err
+			continue
+		}
+
+		// reap subprocess when it exits
+		go func() {
+			// Surface the exit status on the done channel; getServerStatus
+			// also detects termination via ProcessState
+			s.done <- s.cmd.Wait()
+		}()
+
+		if err = s.waitUntilRunning(); err != nil {
+			slog.Error("error starting llama server", "server", servers[i], "error", err)
+			s.Close()
+			finalErr = err
+			continue
+		}
+		return s, nil
+	}
+
+	slog.Error("unable to load any llama server", "error", finalErr)
+	return nil, finalErr
+}
+
+func projectorMemoryRequirements(filename string) int64 {
+	file, err := os.Open(filename)
+	if err != nil {
+		return 0
+	}
+	defer file.Close()
+
+	ggml, _, err := DecodeGGML(file)
+	if err != nil {
+		return 0
+	}
+
+	prefixes := make(map[string]struct{})
+	for _, layer := range ggml.Tensors() {
+		parts := strings.Split(layer.Name, ".")
+		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+	}
+
+	var ask int64
+	for prefix := range prefixes {
+		ask += ggml.LayerSize(prefix)
+	}
+
+	return ask
+}
+
+type ServerStatus int
+
+const ( // iota is reset to 0
+	ServerStatusReady ServerStatus = iota
+	ServerStatusNoSlotsAvailable
+	ServerStatusLoadingModel
+	ServerStatusNotResponding
+	ServerStatusError
+)
+
+type ServerStatusResp struct {
+	Status          string `json:"status"`
+	SlotsIdle       int    `json:"slots_idle"`
+	SlotsProcessing int    `json:"slots_processing"`
+	Error           string `json:"error"`
+}
+
+func (s *LlamaServer) getServerStatus(ctx context.Context) (ServerStatus, error) {
+	// Fail fast if its exited
+	if s.cmd.ProcessState != nil {
+		msg := ""
+		if s.status != nil && s.status.LastErrMsg != "" {
+			msg = s.status.LastErrMsg
+		}
+		return ServerStatusError, fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, fmt.Sprintf("http://127.0.0.1:%d/health", s.port), nil)
+	if err != nil {
+		return ServerStatusError, fmt.Errorf("error creating GET request: %v", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		if errors.Is(err, context.DeadlineExceeded) {
+			return ServerStatusNotResponding, fmt.Errorf("server not responding")
+		}
+		return ServerStatusError, fmt.Errorf("health resp: %w", err)
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return ServerStatusError, fmt.Errorf("read health request: %w", err)
+	}
+
+	var status ServerStatusResp
+	if err := json.Unmarshal(body, &status); err != nil {
+		return ServerStatusError, fmt.Errorf("health unmarshal encode response: %w", err)
+	}
+
+	switch status.Status {
+	case "ok":
+		return ServerStatusReady, nil
+	case "no slot available":
+		return ServerStatusNoSlotsAvailable, nil
+	case "loading model":
+		return ServerStatusLoadingModel, nil
+	default:
+		return ServerStatusError, fmt.Errorf("server error: %+v", status)
+	}
+}
+
+func (s *LlamaServer) Ping(ctx context.Context) error {
+	_, err := s.getServerStatus(ctx)
+	if err != nil {
+		slog.Debug("server unhealthy", "error", err)
+		return err
+	}
+	return nil
+}
+
+func (s *LlamaServer) waitUntilRunning() error {
+	start := time.Now()
+	expiresAt := time.Now().Add(3 * time.Minute) // be generous with timeout, large models can take a while to load
+	ticker := time.NewTicker(50 * time.Millisecond)
+	defer ticker.Stop()
+
+	slog.Info("waiting for llama runner to start responding")
+	var lastStatus ServerStatus = -1
+	for {
+		select {
+		case err := <-s.done:
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
+			}
+			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
+		case <-ticker.C:
+			if time.Now().After(expiresAt) {
+				// timeout
+				msg := ""
+				if s.status != nil && s.status.LastErrMsg != "" {
+					msg = s.status.LastErrMsg
+				}
+				return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
+			}
+			if s.cmd.ProcessState != nil {
+				msg := ""
+				if s.status != nil && s.status.LastErrMsg != "" {
+					msg = s.status.LastErrMsg
+				}
+				return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
+			}
+
+			ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
+			defer cancel()
+			status, err := s.getServerStatus(ctx)
+			if err != nil && lastStatus != status {
+				slog.Debug("server not yet available", "error", err)
+				lastStatus = status
+				continue
+			}
+
+			switch status {
+			case ServerStatusLoadingModel:
+				// TODO - this state never seems to happen with the current server.cpp code (bug?)
+				// it doesn't respond to the health endpoint until after the model is loaded
+				slog.Debug("loading model")
+			case ServerStatusReady:
+				slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
+				return nil
+			}
+		}
+	}
+}
+
+const jsonGrammar = `
+root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+
+string ::=
+  "\"" (
+    [^"\\] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+  )* "\"" ws
+
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= ([ \t\n] ws)?
+`
+
+const maxBufferSize = 512 * format.KiloByte
+const maxRetries = 3
+
+type ImageData struct {
+	Data []byte `json:"data"`
+	ID   int    `json:"id"`
+}
+
+type completion struct {
+	Content string `json:"content"`
+	Model   string `json:"model"`
+	Prompt  string `json:"prompt"`
+	Stop    bool   `json:"stop"`
+
+	Timings struct {
+		PredictedN  int     `json:"predicted_n"`
+		PredictedMS float64 `json:"predicted_ms"`
+		PromptN     int     `json:"prompt_n"`
+		PromptMS    float64 `json:"prompt_ms"`
+	}
+}
+
+type CompletionRequest struct {
+	Prompt  string
+	Format  string
+	Images  []ImageData
+	Options api.Options
+}
+
+type CompletionResponse struct {
+	Content            string
+	Done               bool
+	PromptEvalCount    int
+	PromptEvalDuration time.Duration
+	EvalCount          int
+	EvalDuration       time.Duration
+}
+
+func (s *LlamaServer) Completion(ctx context.Context, req CompletionRequest, fn func(CompletionResponse)) error {
+	request := map[string]any{
+		"prompt":            req.Prompt,
+		"stream":            true,
+		"n_predict":         req.Options.NumPredict,
+		"n_keep":            req.Options.NumKeep,
+		"main_gpu":          req.Options.MainGPU,
+		"temperature":       req.Options.Temperature,
+		"top_k":             req.Options.TopK,
+		"top_p":             req.Options.TopP,
+		"tfs_z":             req.Options.TFSZ,
+		"typical_p":         req.Options.TypicalP,
+		"repeat_last_n":     req.Options.RepeatLastN,
+		"repeat_penalty":    req.Options.RepeatPenalty,
+		"presence_penalty":  req.Options.PresencePenalty,
+		"frequency_penalty": req.Options.FrequencyPenalty,
+		"mirostat":          req.Options.Mirostat,
+		"mirostat_tau":      req.Options.MirostatTau,
+		"mirostat_eta":      req.Options.MirostatEta,
+		"penalize_nl":       req.Options.PenalizeNewline,
+		"seed":              req.Options.Seed,
+		"stop":              req.Options.Stop,
+		"image_data":        req.Images,
+		"cache_prompt":      true,
+	}
+
+	// Make sure the server is ready
+	status, err := s.getServerStatus(ctx)
+	if err != nil {
+		return err
+	} else if status != ServerStatusReady {
+		return fmt.Errorf("unexpected server status: %d", status)
+	}
+
+	if req.Format == "json" {
+		request["grammar"] = jsonGrammar
+		if !strings.Contains(strings.ToLower(req.Prompt), "json") {
+			slog.Warn("Prompt does not specify that the LLM should response in JSON, but JSON format is expected. For best results specify that JSON is expected in the system prompt.")
+		}
+	}
+
+	retryDelay := 100 * time.Microsecond
+	for retries := 0; retries < maxRetries; retries++ {
+		if retries > 0 {
+			time.Sleep(retryDelay) // wait before retrying
+			retryDelay *= 2        // exponential backoff
+		}
+
+		// Handling JSON marshaling with special characters unescaped.
+		buffer := &bytes.Buffer{}
+		enc := json.NewEncoder(buffer)
+		enc.SetEscapeHTML(false)
+
+		if err := enc.Encode(request); err != nil {
+			return fmt.Errorf("failed to marshal data: %v", err)
+		}
+
+		endpoint := fmt.Sprintf("http://127.0.0.1:%d/completion", s.port)
+		req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, buffer)
+		if err != nil {
+			return fmt.Errorf("error creating POST request: %v", err)
+		}
+		req.Header.Set("Content-Type", "application/json")
+
+		resp, err := http.DefaultClient.Do(req)
+		if err != nil {
+			return fmt.Errorf("POST predict: %v", err)
+		}
+		defer resp.Body.Close()
+
+		if resp.StatusCode >= 400 {
+			bodyBytes, err := io.ReadAll(resp.Body)
+			if err != nil {
+				return fmt.Errorf("failed reading llm error response: %w", err)
+			}
+			log.Printf("llm predict error: %s", bodyBytes)
+			return fmt.Errorf("%s", bodyBytes)
+		}
+
+		scanner := bufio.NewScanner(resp.Body)
+		buf := make([]byte, 0, maxBufferSize)
+		scanner.Buffer(buf, maxBufferSize)
+
+		retryNeeded := false
+		// keep track of the last token generated, this is used to abort if the model starts looping
+		var lastToken string
+		var tokenRepeat int
+
+		for scanner.Scan() {
+			select {
+			case <-ctx.Done():
+				// This handles the request cancellation
+				return ctx.Err()
+			default:
+				line := scanner.Bytes()
+				if len(line) == 0 {
+					continue
+				}
+
+				// try again on slot unavailable
+				if bytes.Contains(line, []byte("slot unavailable")) {
+					retryNeeded = true
+					break
+				}
+
+				evt, ok := bytes.CutPrefix(line, []byte("data: "))
+				if !ok {
+					return fmt.Errorf("error parsing llm response stream: %s", line)
+				}
+
+				var c completion
+				if err := json.Unmarshal(evt, &c); err != nil {
+					return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
+				}
+
+				switch {
+				case strings.TrimSpace(c.Content) == lastToken:
+					tokenRepeat++
+				default:
+					lastToken = strings.TrimSpace(c.Content)
+					tokenRepeat = 0
+				}
+
+				// 30 picked as an arbitrary max token repeat limit, modify as needed
+				if tokenRepeat > 30 {
+					slog.Debug("prediction aborted, token repeat limit reached")
+					return ctx.Err()
+				}
+
+				if c.Content != "" {
+					fn(CompletionResponse{
+						Content: c.Content,
+					})
+				}
+
+				if c.Stop {
+					fn(CompletionResponse{
+						Done:               true,
+						PromptEvalCount:    c.Timings.PromptN,
+						PromptEvalDuration: parseDurationMs(c.Timings.PromptMS),
+						EvalCount:          c.Timings.PredictedN,
+						EvalDuration:       parseDurationMs(c.Timings.PredictedMS),
+					})
+					return nil
+				}
+			}
+		}
+
+		if err := scanner.Err(); err != nil {
+			if strings.Contains(err.Error(), "unexpected EOF") {
+				s.Close()
+				msg := ""
+				if s.status != nil && s.status.LastErrMsg != "" {
+					msg = s.status.LastErrMsg
+				}
+
+				return fmt.Errorf("an unknown error was encountered while running the model %s", msg)
+			}
+			return fmt.Errorf("error reading llm response: %v", err)
+		}
+
+		if !retryNeeded {
+			return nil // success
+		}
+	}
+
+	// only reached if every attempt hit a recoverable error and retries were exhausted
+	return fmt.Errorf("max retries exceeded")
+}
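
For orientation, here is a minimal sketch (not part of the commit) of how a caller drives this streaming API. It assumes a *LlamaServer `s` that has already been started and is ready; the `generate` helper name is illustrative, only the Prompt field of CompletionRequest is set, and the snippet would live alongside server.go in package llm using just "context" and "strings" from the standard library.

```go
// generate streams a completion and collects the generated text (illustrative helper).
func generate(ctx context.Context, s *LlamaServer, prompt string) (string, error) {
	var sb strings.Builder
	err := s.Completion(ctx, CompletionRequest{Prompt: prompt}, func(r CompletionResponse) {
		// each callback delivers one streamed chunk of generated text
		sb.WriteString(r.Content)
		// r.Done is true on the final callback, which also carries eval counts and timings
	})
	return sb.String(), err
}
```
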
+
+type EmbeddingRequest struct {
+	Content string `json:"content"`
+}
+
+type EmbeddingResponse struct {
+	Embedding []float64 `json:"embedding"`
+}
+
+func (s *LlamaServer) Embedding(ctx context.Context, prompt string) ([]float64, error) {
+	// Make sure the server is ready
+	status, err := s.getServerStatus(ctx)
+	if err != nil {
+		return nil, err
+	} else if status != ServerStatusReady {
+		return nil, fmt.Errorf("unexpected server status: %d", status)
+	}
+
+	data, err := json.Marshal(EmbeddingRequest{Content: prompt})
+	if err != nil {
+		return nil, fmt.Errorf("error marshaling embed data: %w", err)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data))
+	if err != nil {
+		return nil, fmt.Errorf("error creating embed request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("do embedding request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("error reading embed response: %w", err)
+	}
+
+	if resp.StatusCode >= 400 {
+		log.Printf("llm embedding error: %s", body)
+		return nil, fmt.Errorf("%s", body)
+	}
+
+	var embedding EmbeddingResponse
+	if err := json.Unmarshal(body, &embedding); err != nil {
+		return nil, fmt.Errorf("unmarshal embedding response: %w", err)
+	}
+
+	return embedding.Embedding, nil
+}
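
As a usage illustration only: embeddings returned here can be compared with cosine similarity. The `cosine` helper below is hypothetical and not part of the llm package; it assumes both vectors come from the same model (equal length) and needs only the standard "math" package.

```go
// cosine computes the cosine similarity of two embedding vectors (hypothetical helper).
func cosine(a, b []float64) float64 {
	var dot, na, nb float64
	for i := range a {
		dot += a[i] * b[i]
		na += a[i] * a[i]
		nb += b[i] * b[i]
	}
	if na == 0 || nb == 0 {
		return 0
	}
	return dot / (math.Sqrt(na) * math.Sqrt(nb))
}

// usage, with s a ready *LlamaServer:
//	e1, _ := s.Embedding(ctx, "the quick brown fox")
//	e2, _ := s.Embedding(ctx, "a fast auburn fox")
//	fmt.Printf("similarity: %.3f\n", cosine(e1, e2))
```
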
+
+type TokenizeRequest struct {
+	Content string `json:"content"`
+}
+
+type TokenizeResponse struct {
+	Tokens []int `json:"tokens"`
+}
+
+func (s *LlamaServer) Tokenize(ctx context.Context, content string) ([]int, error) {
+	// Make sure the server is ready
+	status, err := s.getServerStatus(ctx)
+	if err != nil {
+		return nil, err
+	} else if status != ServerStatusReady {
+		return nil, fmt.Errorf("unexpected server status: %d", status)
+	}
+
+	data, err := json.Marshal(TokenizeRequest{Content: content})
+	if err != nil {
+		return nil, fmt.Errorf("marshaling encode data: %w", err)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data))
+	if err != nil {
+		return nil, fmt.Errorf("encode request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("do encode request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("read encode response: %w", err)
+	}
+
+	if resp.StatusCode >= 400 {
+		log.Printf("llm encode error: %s", body)
+		return nil, fmt.Errorf("%s", body)
+	}
+
+	var encoded TokenizeResponse
+	if err := json.Unmarshal(body, &encoded); err != nil {
+		return nil, fmt.Errorf("unmarshal encode response: %w", err)
+	}
+
+	return encoded.Tokens, nil
+}
+
+type DetokenizeRequest struct {
+	Tokens []int `json:"tokens"`
+}
+
+type DetokenizeResponse struct {
+	Content string `json:"content"`
+}
+
+func (s *LlamaServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
+	// Make sure the server is ready
+	status, err := s.getServerStatus(ctx)
+	if err != nil {
+		return "", err
+	} else if status != ServerStatusReady {
+		return "", fmt.Errorf("unexpected server status: %d", status)
+	}
+
+	data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
+	if err != nil {
+		return "", fmt.Errorf("marshaling decode data: %w", err)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data))
+	if err != nil {
+		return "", fmt.Errorf("decode request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := http.DefaultClient.Do(req)
+	if err != nil {
+		return "", fmt.Errorf("do decode request: %w", err)
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return "", fmt.Errorf("read decode response: %w", err)
+	}
+
+	if resp.StatusCode >= 400 {
+		log.Printf("llm decode error: %s", body)
+		return "", fmt.Errorf("%s", body)
+	}
+
+	var decoded DetokenizeResponse
+	if err := json.Unmarshal(body, &decoded); err != nil {
+		return "", fmt.Errorf("unmarshal decode response: %w", err)
+	}
+
+	return decoded.Content, nil
+}
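
A hedged sketch of how Tokenize and Detokenize pair up in practice, e.g. trimming a prompt to its last numCtx tokens before generation. The fitToContext helper and its parameter names are illustrative, not part of the change; it needs only "context".

```go
// fitToContext keeps at most numCtx trailing tokens of a prompt (illustrative helper).
func fitToContext(ctx context.Context, s *LlamaServer, prompt string, numCtx int) (string, error) {
	tokens, err := s.Tokenize(ctx, prompt)
	if err != nil {
		return "", err
	}
	if len(tokens) <= numCtx {
		return prompt, nil
	}
	// keep only the most recent numCtx tokens and decode them back to text
	return s.Detokenize(ctx, tokens[len(tokens)-numCtx:])
}
```
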
+
+func (s *LlamaServer) Close() error {
+	if s.cmd != nil {
+		slog.Debug("stopping llama server")
+		return s.cmd.Process.Kill()
+	}
+
+	return nil
+}
+
+func parseDurationMs(ms float64) time.Duration {
+	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
+	if err != nil {
+		panic(err)
+	}
+
+	return dur
+}
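
parseDurationMs round-trips the value through a formatted string; an equivalent direct conversion (a sketch, identical up to floating-point rounding) avoids the string formatting and the unreachable panic path:

```go
// parseDurationMs converts a millisecond count to a time.Duration without string parsing.
func parseDurationMs(ms float64) time.Duration {
	return time.Duration(ms * float64(time.Millisecond))
}
```
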

+ 42 - 0
llm/status.go

@@ -0,0 +1,42 @@
+package llm
+
+import (
+	"bytes"
+	"os"
+)
+
+// StatusWriter is a writer that captures error messages from the llama runner process
+type StatusWriter struct {
+	LastErrMsg string
+	out        *os.File
+}
+
+func NewStatusWriter(out *os.File) *StatusWriter {
+	return &StatusWriter{
+		out: out,
+	}
+}
+
+// TODO - regex matching to detect errors like
+// libcublasLt.so.11: cannot open shared object file: No such file or directory
+
+var errorPrefixes = []string{
+	"error:",
+	"CUDA error",
+	"cudaMalloc failed",
+	"\"ERR\"",
+}
+
+func (w *StatusWriter) Write(b []byte) (int, error) {
+	var errMsg string
+	for _, prefix := range errorPrefixes {
+		if _, after, ok := bytes.Cut(b, []byte(prefix)); ok {
+			errMsg = prefix + string(bytes.TrimSpace(after))
+		}
+	}
+	if errMsg != "" {
+		w.LastErrMsg = errMsg
+	}
+
+	return w.out.Write(b)
+}

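A hedged sketch of how this writer is intended to be attached to the runner subprocess, so that LastErrMsg is available for the "unknown error" path in Completion above. The command path and flags below are placeholders; the real launch happens in NewLlamaServer. The snippet assumes package llm plus "os" and "os/exec".

```go
// startRunner tees the subprocess stderr through a StatusWriter (placeholder launch).
func startRunner() (*exec.Cmd, *StatusWriter, error) {
	cmd := exec.Command("/path/to/ollama-runner", "--port", "12345") // placeholder command and flags
	sw := NewStatusWriter(os.Stderr)                                 // forward stderr while capturing error lines
	cmd.Stderr = sw
	if err := cmd.Start(); err != nil {
		return nil, nil, err
	}
	return cmd, sw, nil
}

// after a crash, sw.LastErrMsg holds the most recent line matching errorPrefixes
```
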
+ 0 - 15
llm/utils.go

@@ -1,15 +0,0 @@
-package llm
-
-import (
-	"fmt"
-	"time"
-)
-
-func parseDurationMs(ms float64) time.Duration {
-	dur, err := time.ParseDuration(fmt.Sprintf("%fms", ms))
-	if err != nil {
-		panic(err)
-	}
-
-	return dur
-}

+ 42 - 40
server/routes.go

@@ -56,12 +56,13 @@ func init() {
 var loaded struct {
 	mu sync.Mutex
 
-	runner llm.LLM
+	llama *llm.LlamaServer
 
-	expireAt    time.Time
 	expireTimer *time.Timer
 
-	*Model
+	model      string
+	adapters   []string
+	projectors []string
 	*api.Options
 }
 
@@ -69,21 +70,28 @@ var defaultSessionDuration = 5 * time.Minute
 
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
 func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
-	needLoad := loaded.runner == nil || // is there a model loaded?
-		loaded.ModelPath != model.ModelPath || // has the base model changed?
-		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
-		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) // have the runner options changed?
+	ctx, cancel := context.WithTimeout(c, 10*time.Second)
+	defer cancel()
+
+	needLoad := loaded.llama == nil || // is there a model loaded?
+		loaded.model != model.ModelPath || // has the base model changed?
+		!reflect.DeepEqual(loaded.adapters, model.AdapterPaths) || // have the adapters changed?
+		!reflect.DeepEqual(loaded.projectors, model.ProjectorPaths) || // have the projectors changed?
+		!reflect.DeepEqual(loaded.Options.Runner, opts.Runner) || // have the runner options changed?
+		loaded.llama.Ping(ctx) != nil
 
 	if needLoad {
-		if loaded.runner != nil {
+		if loaded.llama != nil {
 			slog.Info("changing loaded model")
-			loaded.runner.Close()
-			loaded.runner = nil
-			loaded.Model = nil
+			loaded.llama.Close()
+			loaded.llama = nil
+			loaded.model = ""
+			loaded.adapters = nil
+			loaded.projectors = nil
 			loaded.Options = nil
 		}
 
-		llmRunner, err := llm.New(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
+		llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
 		if err != nil {
 			// some older models are not compatible with newer versions of llama.cpp
 			// show a generalized compatibility error until there is a better way to
@@ -95,28 +103,26 @@ func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.
 			return err
 		}
 
-		loaded.Model = model
-		loaded.runner = llmRunner
+		loaded.model = model.ModelPath
+		loaded.adapters = model.AdapterPaths
+		loaded.projectors = model.ProjectorPaths
+		loaded.llama = llama
 		loaded.Options = opts
 	}
 
-	loaded.expireAt = time.Now().Add(sessionDuration)
-
 	if loaded.expireTimer == nil {
 		loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
 			loaded.mu.Lock()
 			defer loaded.mu.Unlock()
 
-			if time.Now().Before(loaded.expireAt) {
-				return
+			if loaded.llama != nil {
+				loaded.llama.Close()
 			}
 
-			if loaded.runner != nil {
-				loaded.runner.Close()
-			}
-
-			loaded.runner = nil
-			loaded.Model = nil
+			loaded.llama = nil
+			loaded.model = ""
+			loaded.adapters = nil
+			loaded.projectors = nil
 			loaded.Options = nil
 		})
 	}
@@ -265,7 +271,7 @@ func GenerateHandler(c *gin.Context) {
 
 		sb.Reset()
 		if req.Context != nil {
-			prev, err := loaded.runner.Decode(c.Request.Context(), req.Context)
+			prev, err := loaded.llama.Detokenize(c.Request.Context(), req.Context)
 			if err != nil {
 				c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 				return
@@ -286,9 +292,8 @@ func GenerateHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)
 
-		fn := func(r llm.PredictResult) {
+		fn := func(r llm.CompletionResponse) {
 			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)
 
 			// Build up the full response
@@ -322,7 +327,7 @@ func GenerateHandler(c *gin.Context) {
 					}
 
 					// TODO (jmorganca): encode() should not strip special tokens
-					tokens, err := loaded.runner.Encode(c.Request.Context(), p)
+					tokens, err := loaded.llama.Tokenize(c.Request.Context(), p)
 					if err != nil {
 						ch <- gin.H{"error": err.Error()}
 						return
@@ -344,13 +349,13 @@ func GenerateHandler(c *gin.Context) {
 		}
 
 		// Start prediction
-		predictReq := llm.PredictOpts{
+		req := llm.CompletionRequest{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
 		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		if err := loaded.llama.Completion(c.Request.Context(), req, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()
@@ -471,7 +476,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		return
 	}
 
-	embedding, err := loaded.runner.Embedding(c.Request.Context(), req.Prompt)
+	embedding, err := loaded.llama.Embedding(c.Request.Context(), req.Prompt)
 	if err != nil {
 		slog.Info(fmt.Sprintf("embedding generation failed: %v", err))
 		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to generate embedding"})
@@ -1123,8 +1128,8 @@ func Serve(ln net.Listener) error {
 	signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-signals
-		if loaded.runner != nil {
-			loaded.runner.Close()
+		if loaded.llama != nil {
+			loaded.llama.Close()
 		}
 		gpu.Cleanup()
 		os.Exit(0)
@@ -1196,7 +1201,7 @@ func streamResponse(c *gin.Context, ch chan any) {
 // ChatPrompt builds up a prompt from a series of messages for the currently `loaded` model
 func chatPrompt(ctx context.Context, template string, messages []api.Message, numCtx int) (string, error) {
 	encode := func(s string) ([]int, error) {
-		return loaded.runner.Encode(ctx, s)
+		return loaded.llama.Tokenize(ctx, s)
 	}
 
 	prompt, err := ChatPrompt(template, messages, numCtx, encode)
@@ -1326,9 +1331,8 @@ func ChatHandler(c *gin.Context) {
 	go func() {
 		defer close(ch)
 
-		fn := func(r llm.PredictResult) {
+		fn := func(r llm.CompletionResponse) {
 			// Update model expiration
-			loaded.expireAt = time.Now().Add(sessionDuration)
 			loaded.expireTimer.Reset(sessionDuration)
 
 			resp := api.ChatResponse{
@@ -1352,14 +1356,12 @@ func ChatHandler(c *gin.Context) {
 			ch <- resp
 		}
 
-		// Start prediction
-		predictReq := llm.PredictOpts{
+		if err := loaded.llama.Completion(c.Request.Context(), llm.CompletionRequest{
 			Prompt:  prompt,
 			Format:  req.Format,
 			Images:  images,
 			Options: opts,
-		}
-		if err := loaded.runner.Predict(c.Request.Context(), predictReq, fn); err != nil {
+		}, fn); err != nil {
 			ch <- gin.H{"error": err.Error()}
 		}
 	}()

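The expireAt bookkeeping is gone: idle shutdown now relies on a single time.AfterFunc timer that each completion callback pushes forward with Reset. A standalone sketch of that pattern (names are illustrative; only "sync" and "time" are needed):

```go
// touch pushes the idle-shutdown deadline out by ttl on every completed request.
var (
	idleMu    sync.Mutex
	idleTimer *time.Timer
)

func touch(ttl time.Duration, shutdown func()) {
	idleMu.Lock()
	defer idleMu.Unlock()
	if idleTimer == nil {
		idleTimer = time.AfterFunc(ttl, shutdown) // first request arms the timer
		return
	}
	idleTimer.Reset(ttl) // subsequent requests just defer the shutdown
}
```
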
+ 1 - 26
server/routes_test.go

@@ -17,7 +17,6 @@ import (
 	"github.com/stretchr/testify/assert"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/version"
 )
@@ -211,7 +210,7 @@ func Test_Routes(t *testing.T) {
 		},
 	}
 
-	s := Server{}
+	s := &Server{}
 	router := s.GenerateRoutes()
 
 	httpSrv := httptest.NewServer(router)
@@ -242,27 +241,3 @@ func Test_Routes(t *testing.T) {
 
 	}
 }
-
-type MockLLM struct {
-	encoding []int
-}
-
-func (llm *MockLLM) Predict(ctx context.Context, pred llm.PredictOpts, fn func(llm.PredictResult)) error {
-	return nil
-}
-
-func (llm *MockLLM) Encode(ctx context.Context, prompt string) ([]int, error) {
-	return llm.encoding, nil
-}
-
-func (llm *MockLLM) Decode(ctx context.Context, tokens []int) (string, error) {
-	return "", nil
-}
-
-func (llm *MockLLM) Embedding(ctx context.Context, input string) ([]float64, error) {
-	return []float64{}, nil
-}
-
-func (llm *MockLLM) Close() {
-	// do nothing
-}