
Revamp the dynamic library shim

This switches the default llama.cpp to be CPU based, and builds the GPU variants
as dynamically loaded libraries that we can select at runtime.

This also bumps the ROCm library to version 6, since 5.7 builds don't work
against the latest ROCm release that just shipped.
Daniel Hiltgen, 1 year ago
Commit 7555ea44f8
14 changed files with 272 additions and 280 deletions
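
For orientation, here is a minimal sketch of the runtime selection flow this commit introduces. GetGPUInfo, GpuInfo.Library, and AvailableShims are real identifiers from the diff below; pickServer is a hypothetical condensation of llm.newLlmServer:

    package main

    import (
        "fmt"

        "github.com/jmorganca/ollama/gpu"
        "github.com/jmorganca/ollama/llm"
    )

    // pickServer condenses the new selection logic: GPU detection yields a
    // library name, which either resolves to an extracted shim (.so) or
    // falls back to the statically linked CPU build.
    func pickServer() string {
        info := gpu.GetGPUInfo() // Library: "cuda_server", "rocm_server", or "default"
        if path, ok := llm.AvailableShims[info.Library]; ok && info.Library != "default" {
            return path // dynamically loaded GPU variant
        }
        return "built-in CPU server"
    }

    func main() { fmt.Println(pickServer()) }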
  1. Dockerfile.build (+1 -1)
  2. gpu/gpu.go (+5 -1)
  3. gpu/gpu_darwin.go (+1 -0)
  4. gpu/types.go (+1 -0)
  5. llm/dynamic_shim.c (+33 -37)
  6. llm/dynamic_shim.h (+74 -0)
  7. llm/ext_server.go (+5 -2)
  8. llm/llama.cpp/gen_common.sh (+5 -5)
  9. llm/llama.cpp/gen_linux.sh (+49 -49)
  10. llm/llama.go (+29 -31)
  11. llm/llm.go (+15 -8)
  12. llm/rocm_shim.h (+0 -73)
  13. llm/shim_darwin.go (+3 -3)
  14. llm/shim_ext_server.go (+51 -70)

+ 1 - 1
Dockerfile.build

@@ -3,7 +3,7 @@ FROM --platform=linux/amd64 ubuntu:20.04 AS base-amd64
 ARG CUDA_VERSION=11.3.1-1
 ARG CMAKE_VERSION=3.22.1
 # ROCm only supports amd64
-ARG ROCM_VERSION=5.7
+ARG ROCM_VERSION=6.0
 
 # Note: https://rocm.docs.amd.com/en/latest/release/user_kernel_space_compat_matrix.html
 RUN apt-get update && \

+ 5 - 1
gpu/gpu.go

@@ -65,7 +65,7 @@ func GetGPUInfo() GpuInfo {
 	}
 
 	var memInfo C.mem_info_t
-	resp := GpuInfo{"", 0, 0}
+	resp := GpuInfo{"", "", 0, 0}
 	if gpuHandles.cuda != nil {
 		C.cuda_check_vram(*gpuHandles.cuda, &memInfo)
 		if memInfo.err != nil {
@@ -73,6 +73,7 @@ func GetGPUInfo() GpuInfo {
 			C.free(unsafe.Pointer(memInfo.err))
 		} else {
 			resp.Driver = "CUDA"
+			resp.Library = "cuda_server"
 		}
 	} else if gpuHandles.rocm != nil {
 		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
@@ -81,11 +82,14 @@ func GetGPUInfo() GpuInfo {
 			C.free(unsafe.Pointer(memInfo.err))
 		} else {
 			resp.Driver = "ROCM"
+			resp.Library = "rocm_server"
 		}
 	}
 	if resp.Driver == "" {
 		C.cpu_check_ram(&memInfo)
 		resp.Driver = "CPU"
+		// In the future we may offer multiple CPU variants to tune CPU features
+		resp.Library = "default"
 	}
 	if memInfo.err != nil {
 		log.Printf("error looking up CPU memory: %s", C.GoString(memInfo.err))

+ 1 - 0
gpu/gpu_darwin.go

@@ -21,6 +21,7 @@ func GetGPUInfo() GpuInfo {
 
 	return GpuInfo{
 		Driver:      "METAL",
+		Library:     "default",
 		TotalMemory: 0,
 		FreeMemory:  0,
 	}

+ 1 - 0
gpu/types.go

@@ -3,6 +3,7 @@ package gpu
 // Beginning of an `ollama info` command
 type GpuInfo struct {
 	Driver      string `json:"driver,omitempty"`
+	Library     string `json:"library,omitempty"`
 	TotalMemory uint64 `json:"total_memory,omitempty"`
 	FreeMemory  uint64 `json:"free_memory,omitempty"`
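
Since GpuInfo is the payload behind the eventual `ollama info` command, the new field shows up directly in its JSON. An illustrative round-trip (all values made up):

    package main

    import (
        "encoding/json"
        "fmt"

        "github.com/jmorganca/ollama/gpu"
    )

    func main() {
        // Hypothetical values for a CUDA machine; only the shape matters here.
        info := gpu.GpuInfo{Driver: "CUDA", Library: "cuda_server", TotalMemory: 8 << 30, FreeMemory: 6 << 30}
        b, _ := json.Marshal(info)
        fmt.Println(string(b))
        // {"driver":"CUDA","library":"cuda_server","total_memory":8589934592,"free_memory":6442450944}
    }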
 

+ 33 - 37
llm/rocm_shim.c → llm/dynamic_shim.c

@@ -1,4 +1,4 @@
-#include "rocm_shim.h"
+#include "dynamic_shim.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -28,8 +28,8 @@ inline static char *LOAD_ERR() {
 #define UNLOAD_LIBRARY(handle) dlclose(handle)
 #endif
 
-void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
-                    ext_server_resp_t *err) {
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err) {
   int i = 0;
   struct lookup {
     char *s;
@@ -57,11 +57,8 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
   s->handle = LOAD_LIBRARY(libPath, RTLD_NOW);
   if (!s->handle) {
     err->id = -1;
-    snprintf(
-        err->msg, err->msg_len,
-        "Unable to load rocm server library: %s (If you have a Radeon card, "
-        "did you install the ROCM libraries?)",
-        LOAD_ERR());
+    snprintf(err->msg, err->msg_len,
+             "Unable to load dynamic server library: %s", LOAD_ERR());
     return;
   }
 
@@ -77,64 +74,63 @@ void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
   }
 }
 
-inline void rocm_shim_llama_server_init(struct rocm_llama_server s,
-                                        ext_server_params_t *sparams,
-                                        ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                           ext_server_params_t *sparams,
+                                           ext_server_resp_t *err) {
   s.llama_server_init(sparams, err);
 }
 
-inline void rocm_shim_llama_server_start(struct rocm_llama_server s) {
+inline void dynamic_shim_llama_server_start(struct dynamic_llama_server s) {
   s.llama_server_start();
 }
 
-inline void rocm_shim_llama_server_stop(struct rocm_llama_server s) {
+inline void dynamic_shim_llama_server_stop(struct dynamic_llama_server s) {
   s.llama_server_stop();
 }
 
-inline void rocm_shim_llama_server_completion(struct rocm_llama_server s,
-                                              const char *json_req,
-                                              ext_server_resp_t *resp) {
+inline void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 ext_server_resp_t *resp) {
   s.llama_server_completion(json_req, resp);
 }
 
-inline void rocm_shim_llama_server_completion_next_result(
-    struct rocm_llama_server s, const int task_id,
+inline void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
     ext_server_task_result_t *result) {
   s.llama_server_completion_next_result(task_id, result);
 }
 
-inline void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
-                                                     const int task_id,
-                                                     ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_completion_cancel(
+    struct dynamic_llama_server s, const int task_id, ext_server_resp_t *err) {
   s.llama_server_completion_cancel(task_id, err);
 }
-inline void rocm_shim_llama_server_release_task_result(
-    struct rocm_llama_server s, ext_server_task_result_t *result) {
+inline void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result) {
   s.llama_server_release_task_result(result);
 }
 
-inline void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
-                                            const char *json_req,
-                                            char **json_resp,
-                                            ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                               const char *json_req,
+                                               char **json_resp,
+                                               ext_server_resp_t *err) {
   s.llama_server_tokenize(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
-                                              const char *json_req,
-                                              char **json_resp,
-                                              ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                                 const char *json_req,
+                                                 char **json_resp,
+                                                 ext_server_resp_t *err) {
   s.llama_server_detokenize(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
-                                             const char *json_req,
-                                             char **json_resp,
-                                             ext_server_resp_t *err) {
+inline void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                                const char *json_req,
+                                                char **json_resp,
+                                                ext_server_resp_t *err) {
   s.llama_server_embedding(json_req, json_resp, err);
 }
 
-inline void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
-                                                     char **json_resp) {
+inline void dynamic_shim_llama_server_release_json_resp(
+    struct dynamic_llama_server s, char **json_resp) {
   s.llama_server_release_json_resp(json_resp);
 }

+ 74 - 0
llm/dynamic_shim.h

@@ -0,0 +1,74 @@
+#include <stdlib.h>
+
+#include "server.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+struct dynamic_llama_server {
+  void *handle;
+  void (*llama_server_init)(ext_server_params_t *sparams,
+                            ext_server_resp_t *err);
+  void (*llama_server_start)();
+  void (*llama_server_stop)();
+  void (*llama_server_completion)(const char *json_req,
+                                  ext_server_resp_t *resp);
+  void (*llama_server_completion_next_result)(const int task_id,
+                                              ext_server_task_result_t *result);
+  void (*llama_server_completion_cancel)(const int task_id,
+                                         ext_server_resp_t *err);
+  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
+  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
+                                ext_server_resp_t *err);
+  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
+                                  ext_server_resp_t *err);
+  void (*llama_server_embedding)(const char *json_req, char **json_resp,
+                                 ext_server_resp_t *err);
+  void (*llama_server_release_json_resp)(char **json_resp);
+};
+
+void dynamic_shim_init(const char *libPath, struct dynamic_llama_server *s,
+                       ext_server_resp_t *err);
+
+// No good way to call C function pointers from Go so inline the indirection
+void dynamic_shim_llama_server_init(struct dynamic_llama_server s,
+                                    ext_server_params_t *sparams,
+                                    ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_start(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_stop(struct dynamic_llama_server s);
+
+void dynamic_shim_llama_server_completion(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          ext_server_resp_t *resp);
+
+void dynamic_shim_llama_server_completion_next_result(
+    struct dynamic_llama_server s, const int task_id,
+    ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_completion_cancel(struct dynamic_llama_server s,
+                                                 const int task_id,
+                                                 ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_release_task_result(
+    struct dynamic_llama_server s, ext_server_task_result_t *result);
+
+void dynamic_shim_llama_server_tokenize(struct dynamic_llama_server s,
+                                        const char *json_req, char **json_resp,
+                                        ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_detokenize(struct dynamic_llama_server s,
+                                          const char *json_req,
+                                          char **json_resp,
+                                          ext_server_resp_t *err);
+
+void dynamic_shim_llama_server_embedding(struct dynamic_llama_server s,
+                                         const char *json_req, char **json_resp,
+                                         ext_server_resp_t *err);
+void dynamic_shim_llama_server_release_json_resp(struct dynamic_llama_server s,
+                                                 char **json_resp);
+
+#ifdef __cplusplus
+}
+#endif
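
The comment "No good way to call C function pointers from Go" is the crux of this header: cgo can call named C functions but cannot invoke C function pointers, so every pointer in the struct gets a named wrapper that performs the indirection. A stripped-down, self-contained analogue of the pattern (demo names are hypothetical, not ollama's API):

    package main

    /*
    typedef void (*start_fn)(void);
    static void demo_start(void) {}
    struct demo_server { start_fn start; };
    // Go sees s.start as an opaque pointer it cannot call, so we wrap the
    // indirect call in a named function that cgo *can* call -- the same
    // trick dynamic_shim.h uses for every llama_server_* entry point.
    static void shim_start(struct demo_server s) { s.start(); }
    static struct demo_server make_server(void) {
        struct demo_server s = { demo_start };
        return s;
    }
    */
    import "C"

    func main() {
        s := C.make_server()
        C.shim_start(s) // indirect call through the struct's function pointer
    }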

+ 5 - 2
llm/ext_server.go

@@ -17,7 +17,10 @@ package llm
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux windows CFLAGS: -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_USE_CUBLAS
 #cgo linux LDFLAGS: -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/local/cuda/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib/stubs
-#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cuda/libollama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/examples/server/libext_server.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/common/libcommon.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libllama.a
+#cgo linux LDFLAGS: ${SRCDIR}/llama.cpp/gguf/build/cpu/libggml_static.a
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm
 #cgo windows LDFLAGS: -L${SRCDIR}/llama.cpp/gguf/build/wincuda/dist/bin
 #cgo windows LDFLAGS: -lext_server_shared -lpthread
@@ -121,7 +124,7 @@ func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
 	C.llama_server_release_json_resp(json_resp)
 }
 
-func newLlamaExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
 	server := &llamaExtServer{opts}
 	return newExtServer(server, model, adapters, projectors, numLayers, opts)
 }

+ 5 - 5
llm/llama.cpp/gen_common.sh

@@ -6,7 +6,7 @@ init_vars() {
     CMAKE_DEFS="-DLLAMA_ACCELERATE=on"
     # TODO - LLAMA_K_QUANTS is stale and needs to be mapped to newer cmake settings
     CMAKE_TARGETS="--target ggml --target ggml_static --target llama --target build_info --target common --target ext_server --target llava_static"
-    if echo "${CGO_CFLAGS}" | grep -- '-g' > /dev/null ; then
+    if echo "${CGO_CFLAGS}" | grep -- '-g' >/dev/null; then
         CMAKE_DEFS="-DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_GPROF=on ${CMAKE_DEFS}"
     else
         # TODO - add additional optimization flags...
@@ -15,7 +15,7 @@ init_vars() {
 }
 
 git_module_setup() {
-    if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
         echo "Skipping submodule initialization"
         return
     fi
@@ -25,13 +25,13 @@ git_module_setup() {
 }
 
 apply_patches() {
-    if [ -n "${OLLAMA_SKIP_PATCHING}" ] ; then
+    if [ -n "${OLLAMA_SKIP_PATCHING}" ]; then
         echo "Skipping submodule patching"
         return
     fi
     # Workaround git apply not handling creation well for iteration
     rm -f gguf/examples/server/server.h
-    for patch in ${PATCHES} ; do
+    for patch in ${PATCHES}; do
         git -C gguf apply ../patches/${patch}
     done
 }
@@ -39,4 +39,4 @@ apply_patches() {
 build() {
     cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
-}
+}

+ 49 - 49
llm/llama.cpp/gen_linux.sh

@@ -1,81 +1,81 @@
 #!/bin/bash
 # This script is intended to run inside the go generate
-# working directory must be ../llm/llama.cpp
+# working directory must be llm/llama.cpp
+
+# First we build our default built-in library which will be linked into the CGO
+# binary as a normal dependency. This default build is CPU based.
+#
+# Then we build a CUDA dynamic library (although statically linked with the CUDA
+# library dependencies for maximum portability)
+#
+# Then, if we detect ROCm, we build a dynamically loaded ROCm lib. ROCm must
+# be a dynamic lib even if it's the only GPU library detected, because we
+# can't redistribute its object files and must rely on the ROCm dynamic
+# libraries being present at runtime; otherwise the server could fail to start.
 
 set -ex
 set -o pipefail
 
 echo "Starting linux generate script"
-if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ] ; then
+if [ -z "${CUDACXX}" -a -x /usr/local/cuda/bin/nvcc ]; then
     export CUDACXX=/usr/local/cuda/bin/nvcc
 fi
+COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off"
+OLLAMA_DYN_LIB_DIR="gguf/build/lib"
+mkdir -p ${OLLAMA_DYN_LIB_DIR}
+touch ${OLLAMA_DYN_LIB_DIR}/.generated
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
 apply_patches
-if [ -d /usr/local/cuda/lib64/ ] ; then
-    CMAKE_DEFS="-DLLAMA_CUBLAS=on -DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-else
-    CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
-fi
-BUILD_DIR="gguf/build/cuda"
-LIB_DIR="${BUILD_DIR}/lib"
-mkdir -p ../../dist/
+
+#
+# CPU first for the default library
+#
+CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+BUILD_DIR="gguf/build/cpu"
 build
 
-if [ -d /usr/local/cuda/lib64/ ] ; then
-    pwd
-    ar -M <<EOF
-create ${BUILD_DIR}/libollama.a
-addlib ${BUILD_DIR}/examples/server/libext_server.a
-addlib ${BUILD_DIR}/common/libcommon.a
-addlib ${BUILD_DIR}/libllama.a
-addlib ${BUILD_DIR}/libggml_static.a
-addlib /usr/local/cuda/lib64/libcudart_static.a
-addlib /usr/local/cuda/lib64/libcublas_static.a
-addlib /usr/local/cuda/lib64/libcublasLt_static.a
-addlib /usr/local/cuda/lib64/libcudadevrt.a
-addlib /usr/local/cuda/lib64/libculibos.a
-save
-end
-EOF
-else
-    ar -M <<EOF
-create ${BUILD_DIR}/libollama.a
-addlib ${BUILD_DIR}/examples/server/libext_server.a
-addlib ${BUILD_DIR}/common/libcommon.a
-addlib ${BUILD_DIR}/libllama.a
-addlib ${BUILD_DIR}/libggml_static.a
-save
-end
-EOF
+if [ -d /usr/local/cuda/lib64/ ]; then
+    echo "CUDA libraries detected - building dynamic CUDA library"
+    init_vars
+    CMAKE_DEFS="-DLLAMA_CUBLAS=on ${COMMON_CMAKE_DEFS} ${CMAKE_DEFS}"
+    BUILD_DIR="gguf/build/cuda"
+    CUDA_LIB_DIR=/usr/local/cuda/lib64
+    build
+    gcc -fPIC -g -shared -o ${OLLAMA_DYN_LIB_DIR}/libcuda_server.so \
+        -Wl,--whole-archive \
+        ${BUILD_DIR}/examples/server/libext_server.a \
+        ${BUILD_DIR}/common/libcommon.a \
+        ${BUILD_DIR}/libllama.a \
+        -Wl,--no-whole-archive \
+        ${CUDA_LIB_DIR}/libcudart_static.a \
+        ${CUDA_LIB_DIR}/libcublas_static.a \
+        ${CUDA_LIB_DIR}/libcublasLt_static.a \
+        ${CUDA_LIB_DIR}/libcudadevrt.a \
+        ${CUDA_LIB_DIR}/libculibos.a \
+        -lrt -lpthread -ldl -lstdc++ -lm
 fi
 
-if [ -z "${ROCM_PATH}" ] ; then
+if [ -z "${ROCM_PATH}" ]; then
     # Try the default location in case it exists
     ROCM_PATH=/opt/rocm
 fi
 
-if [ -z "${CLBlast_DIR}" ] ; then
+if [ -z "${CLBlast_DIR}" ]; then
     # Try the default location in case it exists
     if [ -d /usr/lib/cmake/CLBlast ]; then
         export CLBlast_DIR=/usr/lib/cmake/CLBlast
     fi
 fi
 
-BUILD_DIR="gguf/build/rocm"
-LIB_DIR="${BUILD_DIR}/lib"
-mkdir -p ${LIB_DIR}
-# Ensure we have at least one file present for the embed
-touch ${LIB_DIR}/.generated 
-
-if [ -d "${ROCM_PATH}" ] ; then
-    echo "Building ROCm"
+if [ -d "${ROCM_PATH}" ]; then
+    echo "ROCm libraries detected - building dynamic ROCm library"
     init_vars
-    CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DCMAKE_VERBOSE_MAKEFILE=on -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
-    CMAKE_DEFS="-DLLAMA_ACCELERATE=on -DLLAMA_NATIVE=off -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102' -DGPU_TARGETS='gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102'"
+    BUILD_DIR="gguf/build/rocm"
     build
-    gcc -fPIC -g -shared -o ${LIB_DIR}/librocm_server.so \
+    gcc -fPIC -g -shared -o ${OLLAMA_DYN_LIB_DIR}/librocm_server.so \
         -Wl,--whole-archive \
         ${BUILD_DIR}/examples/server/libext_server.a \
         ${BUILD_DIR}/common/libcommon.a \
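
The end state of a full Linux build, then, is roughly the following layout (illustrative listing; only the gguf/build/lib contents are embedded into the binary and extracted at runtime):

    gguf/build/cpu/...                  default build, statically linked into the Go binary
    gguf/build/cuda/...                 intermediate CUDA archives
    gguf/build/rocm/...                 intermediate ROCm archives
    gguf/build/lib/libcuda_server.so    dynamic CUDA shim (if CUDA was detected)
    gguf/build/lib/librocm_server.so    dynamic ROCm shim (if ROCm was detected)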

+ 29 - 31
llm/llama.go

@@ -8,7 +8,6 @@ import (
 	"fmt"
 	"io"
 	"io/fs"
-	"log"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -120,7 +119,7 @@ type ImageData struct {
 var (
 	errNvidiaSMI     = errors.New("warning: gpu support may not be enabled, check that you have installed GPU drivers: nvidia-smi command failed")
 	errAvailableVRAM = errors.New("not enough VRAM available, falling back to CPU only")
-	payloadMissing   = fmt.Errorf("expected payload not included in this build of ollama")
+	payloadMissing   = fmt.Errorf("expected dynamic library payloads not included in this build of ollama")
 )
 
 // StatusWriter is a writer that captures error messages from the llama runner process
@@ -208,41 +207,40 @@ type EmbeddingResponse struct {
 	Embedding []float64 `json:"embedding"`
 }
 
-func extractLib(workDir, glob string) error {
+func extractDynamicLibs(workDir, glob string) ([]string, error) {
 	files, err := fs.Glob(libEmbed, glob)
 	if err != nil || len(files) == 0 {
-		return payloadMissing
+		return nil, payloadMissing
 	}
+	libs := make([]string, len(files))
 
-	if len(files) != 1 {
-		// Shouldn't happen, but just use the first one we find
-		log.Printf("WARNING: multiple payloads detected - using %s", files[0])
-	}
-
-	srcFile, err := libEmbed.Open(files[0])
-	if err != nil {
-		return fmt.Errorf("read payload %s: %v", files[0], err)
-	}
-	defer srcFile.Close()
-	if err := os.MkdirAll(workDir, 0o755); err != nil {
-		return fmt.Errorf("create payload temp dir %s: %v", workDir, err)
-	}
-
-	destFile := filepath.Join(workDir, filepath.Base(files[0]))
-
-	_, err = os.Stat(destFile)
-	switch {
-	case errors.Is(err, os.ErrNotExist):
-		destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+	for i, file := range files {
+		srcFile, err := libEmbed.Open(file)
 		if err != nil {
-			return fmt.Errorf("write payload %s: %v", files[0], err)
+			return nil, fmt.Errorf("read payload %s: %v", file, err)
 		}
-		defer destFile.Close()
-		if _, err := io.Copy(destFile, srcFile); err != nil {
-			return fmt.Errorf("copy payload %s: %v", files[0], err)
+		defer srcFile.Close()
+		if err := os.MkdirAll(workDir, 0o755); err != nil {
+			return nil, fmt.Errorf("create payload temp dir %s: %v", workDir, err)
+		}
+
+		destFile := filepath.Join(workDir, filepath.Base(file))
+		libs[i] = destFile
+
+		_, err = os.Stat(destFile)
+		switch {
+		case errors.Is(err, os.ErrNotExist):
+			destFile, err := os.OpenFile(destFile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
+			if err != nil {
+				return nil, fmt.Errorf("write payload %s: %v", file, err)
+			}
+			defer destFile.Close()
+			if _, err := io.Copy(destFile, srcFile); err != nil {
+				return nil, fmt.Errorf("copy payload %s: %v", file, err)
+			}
+		case err != nil:
+			return nil, fmt.Errorf("stat payload %s: %v", file, err)
 		}
-	case err != nil:
-		return fmt.Errorf("stat payload %s: %v", files[0], err)
 	}
-	return nil
+	return libs, nil
 }
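
A hedged usage sketch of the new helper, from inside package llm (demoExtract is hypothetical; nativeInit in shim_ext_server.go below is the real caller):

    // demoExtract unpacks every embedded *server* library into workDir and
    // logs the resulting paths.
    func demoExtract(workDir string) {
        libs, err := extractDynamicLibs(workDir, "llama.cpp/gguf/build/lib/*server*")
        if err != nil {
            log.Printf("no dynamic payloads: %v", err) // e.g. payloadMissing
            return
        }
        for _, lib := range libs {
            log.Printf("extracted %s", lib) // e.g. <workDir>/librocm_server.so
        }
    }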

+ 15 - 8
llm/llm.go

@@ -22,8 +22,7 @@ type LLM interface {
 	Close()
 }
 
-// Set to false on linux/windows if we are able to load the shim
-var ShimPresent = false
+var AvailableShims = map[string]string{}
 
 func New(workDir, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
@@ -82,15 +81,23 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
 	gpuInfo := gpu.GetGPUInfo()
-	if gpuInfo.Driver == "ROCM" && ShimPresent {
-		return newRocmShimExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
-	} else {
-		// Rely on the built-in CUDA/Metal based server which will fall back to CPU
-		return newLlamaExtServer(model, adapters, projectors, ggml.NumLayers(), opts)
-	}
+	return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts)
 }
 
 // Give any native cgo implementations an opportunity to initialize
 func Init(workdir string) error {
 	return nativeInit(workdir)
 }
+
+func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
+		srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, numLayers, opts)
+		if err == nil {
+			return srv, nil
+		}
+		log.Printf("Failed to load dynamic library, falling back to CPU mode: %s", err)
+	}
+
+	return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
+
+}
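
After nativeInit (in shim_ext_server.go below) extracts the payloads, AvailableShims might look like this on a machine with both GPU builds embedded (paths illustrative; keys derive from the extracted file names):

    // Illustrative contents only -- populated by nativeInit at startup.
    AvailableShims = map[string]string{
        "cuda_server": "/tmp/ollama1234/libcuda_server.so",
        "rocm_server": "/tmp/ollama1234/librocm_server.so",
    }

newLlmServer then dlopens the matching .so via the shim, and any load failure drops through to newDefaultExtServer, so a broken driver stack degrades to CPU rather than failing outright.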

+ 0 - 73
llm/rocm_shim.h

@@ -1,73 +0,0 @@
-#include <stdlib.h>
-
-#include "server.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-struct rocm_llama_server {
-  void *handle;
-  void (*llama_server_init)(ext_server_params_t *sparams,
-                            ext_server_resp_t *err);
-  void (*llama_server_start)();
-  void (*llama_server_stop)();
-  void (*llama_server_completion)(const char *json_req,
-                                  ext_server_resp_t *resp);
-  void (*llama_server_completion_next_result)(const int task_id,
-                                              ext_server_task_result_t *result);
-  void (*llama_server_completion_cancel)(const int task_id,
-                                         ext_server_resp_t *err);
-  void (*llama_server_release_task_result)(ext_server_task_result_t *result);
-  void (*llama_server_tokenize)(const char *json_req, char **json_resp,
-                                ext_server_resp_t *err);
-  void (*llama_server_detokenize)(const char *json_req, char **json_resp,
-                                  ext_server_resp_t *err);
-  void (*llama_server_embedding)(const char *json_req, char **json_resp,
-                                 ext_server_resp_t *err);
-  void (*llama_server_release_json_resp)(char **json_resp);
-};
-
-void rocm_shim_init(const char *libPath, struct rocm_llama_server *s,
-                    ext_server_resp_t *err);
-
-// No good way to call C function pointers from Go so inline the indirection
-void rocm_shim_llama_server_init(struct rocm_llama_server s,
-                                 ext_server_params_t *sparams,
-                                 ext_server_resp_t *err);
-
-void rocm_shim_llama_server_start(struct rocm_llama_server s);
-
-void rocm_shim_llama_server_stop(struct rocm_llama_server s);
-
-void rocm_shim_llama_server_completion(struct rocm_llama_server s,
-                                       const char *json_req,
-                                       ext_server_resp_t *resp);
-
-void rocm_shim_llama_server_completion_next_result(
-    struct rocm_llama_server s, const int task_id,
-    ext_server_task_result_t *result);
-
-void rocm_shim_llama_server_completion_cancel(struct rocm_llama_server s,
-                                              const int task_id,
-                                              ext_server_resp_t *err);
-
-void rocm_shim_llama_server_release_task_result(
-    struct rocm_llama_server s, ext_server_task_result_t *result);
-
-void rocm_shim_llama_server_tokenize(struct rocm_llama_server s,
-                                     const char *json_req, char **json_resp,
-                                     ext_server_resp_t *err);
-
-void rocm_shim_llama_server_detokenize(struct rocm_llama_server s,
-                                       const char *json_req, char **json_resp,
-                                       ext_server_resp_t *err);
-
-void rocm_shim_llama_server_embedding(struct rocm_llama_server s,
-                                      const char *json_req, char **json_resp,
-                                      ext_server_resp_t *err);
-void rocm_shim_llama_server_release_json_resp(struct rocm_llama_server s,
-                                              char **json_resp);
-
-#ifdef __cplusplus
-}
-#endif

+ 3 - 3
llm/shim_darwin.go

@@ -12,13 +12,13 @@ import (
 //go:embed llama.cpp/gguf/ggml-metal.metal
 var libEmbed embed.FS
 
-func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
 	// should never happen...
-	return nil, fmt.Errorf("ROCM GPUs not supported on Mac")
+	return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
 }
 
 func nativeInit(workdir string) error {
-	err := extractLib(workdir, "llama.cpp/gguf/ggml-metal.metal")
+	_, err := extractDynamicLibs(workdir, "llama.cpp/gguf/ggml-metal.metal")
 	if err != nil {
 		if err == payloadMissing {
 			// TODO perhaps consider this a hard failure on arm macs?

+ 51 - 70
llm/shim_ext_server.go

@@ -5,7 +5,7 @@ package llm
 /*
 
 #include <stdlib.h>
-#include "rocm_shim.h"
+#include "dynamic_shim.h"
 
 */
 import "C"
@@ -18,20 +18,20 @@ import (
 	"log"
 	"os"
 	"path/filepath"
-	"runtime"
+	"strings"
 	"sync"
 	"unsafe"
 
 	"github.com/jmorganca/ollama/api"
 )
 
-//go:embed llama.cpp/gguf/build/*/lib/*
+//go:embed llama.cpp/gguf/build/lib/*
 var libEmbed embed.FS
 
 var RocmShimMissing = fmt.Errorf("ROCm shim library not included in this build of ollama. Radeon GPUs are not supported")
 
 type shimExtServer struct {
-	s       C.struct_rocm_llama_server
+	s       C.struct_dynamic_llama_server
 	options api.Options
 }
 
@@ -40,50 +40,58 @@ var shimMutex sync.Mutex
 var llm *shimExtServer
 
 func (llm *shimExtServer) llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_init(llm.s, sparams, err)
+	C.dynamic_shim_llama_server_init(llm.s, sparams, err)
 }
 func (llm *shimExtServer) llama_server_start() {
-	C.rocm_shim_llama_server_start(llm.s)
+	C.dynamic_shim_llama_server_start(llm.s)
 }
 func (llm *shimExtServer) llama_server_stop() {
-	C.rocm_shim_llama_server_stop(llm.s)
+	C.dynamic_shim_llama_server_stop(llm.s)
 }
 
 func (llm *shimExtServer) llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_completion(llm.s, json_req, resp)
+	C.dynamic_shim_llama_server_completion(llm.s, json_req, resp)
 }
 func (llm *shimExtServer) llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t) {
-	C.rocm_shim_llama_server_completion_next_result(llm.s, task_id, resp)
+	C.dynamic_shim_llama_server_completion_next_result(llm.s, task_id, resp)
 }
 func (llm *shimExtServer) llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_completion_cancel(llm.s, task_id, err)
+	C.dynamic_shim_llama_server_completion_cancel(llm.s, task_id, err)
 }
 func (llm *shimExtServer) llama_server_release_task_result(result *C.ext_server_task_result_t) {
-	C.rocm_shim_llama_server_release_task_result(llm.s, result)
+	C.dynamic_shim_llama_server_release_task_result(llm.s, result)
 }
 
 func (llm *shimExtServer) llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_tokenize(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_detokenize(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t) {
-	C.rocm_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
+	C.dynamic_shim_llama_server_embedding(llm.s, json_req, json_resp, err)
 }
 func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
-	C.rocm_shim_llama_server_release_json_resp(llm.s, json_resp)
+	C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
 }
 
-func newRocmShimExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
-	if !ShimPresent {
-		return nil, RocmShimMissing
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+	shimMutex.Lock()
+	defer shimMutex.Unlock()
+	libPath := C.CString(library)
+	defer C.free(unsafe.Pointer(libPath))
+	resp := newExtServerResp(128)
+	defer freeExtServerResp(resp)
+	var srv C.struct_dynamic_llama_server
+	C.dynamic_shim_init(libPath, &srv, &resp)
+	if resp.id < 0 {
+		return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
 	}
-	log.Printf("Loading ROCM llm server")
-	if llm == nil {
-		return nil, fmt.Errorf("nativeInit wasnt called or libary load failed")
+	llm = &shimExtServer{
+		s:       srv,
+		options: opts,
 	}
-	llm.options = opts
+	log.Printf("Loading Dynamic Shim llm server: %s", library)
 	return newExtServer(llm, model, adapters, projectors, numLayers, opts)
 }
 
@@ -108,64 +116,37 @@ func (llm *shimExtServer) Close() {
 }
 
 func nativeInit(workdir string) error {
-	err := extractLib(workdir, "llama.cpp/gguf/build/*/lib/*rocm_server*")
+	libs, err := extractDynamicLibs(workdir, "llama.cpp/gguf/build/lib/*server*")
 	if err != nil {
 		if err == payloadMissing {
-			log.Printf("%s", RocmShimMissing)
+			log.Printf("%s", payloadMissing)
 			return nil
 		}
 		return err
-	} else {
-		ShimPresent = true
+	}
+	for _, lib := range libs {
+		libName := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
+		AvailableShims[libName] = lib
 	}
 
-	// Verify we have permissions - either running as root, or we have group access to the driver
-	fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
-	if err != nil {
-		if errors.Is(err, fs.ErrPermission) {
-			log.Fatalf("Radeon card detected, but permissions not set up properly.  Either run ollama as root, or add you user account to the render group.")
-			return err
-		} else if errors.Is(err, fs.ErrNotExist) {
-			// expected behavior without a radeon card
-			return nil
+	// Only check ROCm access if we have the dynamic lib loaded
+	if _, rocmPresent := AvailableShims["rocm_server"]; rocmPresent {
+		// Verify we have permissions - either running as root, or we have group access to the driver
+		fd, err := os.OpenFile("/dev/kfd", os.O_RDWR, 0666)
+		if err != nil {
+			if errors.Is(err, fs.ErrPermission) {
+				log.Fatalf("Radeon card detected, but permissions not set up properly.  Either run ollama as root, or add your user account to the render group.")
+				return err
+			} else if errors.Is(err, fs.ErrNotExist) {
+				// expected behavior without a radeon card
+				return nil
+			}
+
+			return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
 		}
+		fd.Close()
 
-		return fmt.Errorf("failed to check permission on /dev/kfd: %w", err)
 	}
-	fd.Close()
 
-	shimMutex.Lock()
-	defer shimMutex.Unlock()
-	if llm != nil {
-		return nil
-	}
-	var libName string
-	switch runtime.GOOS {
-	case "darwin":
-		// shouldn't happen
-		return nil
-	case "linux":
-		libName = "librocm_server.so"
-	case "windows":
-		libName = "rocm_server.dll"
-	default:
-		// shouldn't happen
-		return nil
-	}
-	libPath := C.CString(filepath.Join(workdir, libName))
-	defer C.free(unsafe.Pointer(libPath))
-	resp := newExtServerResp(128)
-	defer freeExtServerResp(resp)
-	var srv C.struct_rocm_llama_server
-	C.rocm_shim_init(libPath, &srv, &resp)
-	if resp.id < 0 {
-		// TODO - consider softening this failure mode to allow fall-back to the CUDA based built-in llm
-		//        and run against CPU
-		return fmt.Errorf("Unable to load AMD GPU library: %s", C.GoString(resp.msg))
-	}
-	llm = &shimExtServer{
-		s:       srv,
-		options: api.DefaultOptions(),
-	}
 	return nil
 }
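
The name derivation in nativeInit (strip the "lib" prefix, drop the extension) is what ties the extracted files back to gpu.GpuInfo.Library. A quick hypothetical test of the transform, covering both the Linux and Windows artifact names:

    package llm

    import (
        "path/filepath"
        "strings"
        "testing"
    )

    func TestShimNameDerivation(t *testing.T) {
        for lib, want := range map[string]string{
            "/tmp/ollama1234/libcuda_server.so": "cuda_server",
            "/tmp/ollama1234/librocm_server.so": "rocm_server",
            "cuda_server.dll":                   "cuda_server", // windows artifact name
        } {
            got := strings.Split(strings.TrimPrefix(filepath.Base(lib), "lib"), ".")[0]
            if got != want {
                t.Errorf("%s -> %q, want %q", lib, got, want)
            }
        }
    }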