@@ -10,25 +10,25 @@ package llm
#cgo darwin CPPFLAGS: -DGGML_USE_METAL -DGGML_METAL_NDEBUG
#cgo darwin LDFLAGS: -lc++ -framework Accelerate
#cgo darwin LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libcommon.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libext_server.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libllama.a
-#cgo darwin LDFLAGS: ${SRCDIR}/llama.cpp/build/darwin/metal/lib/libggml_static.a
#cgo linux CFLAGS: -D_GNU_SOURCE
#cgo linux LDFLAGS: -lrt -ldl -lstdc++ -lm
#cgo linux windows LDFLAGS: -lpthread

#include <stdlib.h>
-#include "ext_server.h"
+#include "dyn_ext_server.h"

*/
import "C"
+
import (
    "bytes"
    "context"
    "encoding/json"
    "fmt"
    "log"
+    "os"
+    "path/filepath"
+    "runtime"
    "strings"
    "sync"
    "time"
@@ -37,21 +37,9 @@ import (
    "github.com/jmorganca/ollama/api"
)

-// TODO switch Linux to always be dynamic
-// If that works out, then look at the impact of doing the same for Mac
-type extServer interface {
-    LLM
-    llama_server_init(sparams *C.ext_server_params_t, err *C.ext_server_resp_t)
-    llama_server_start()
-    llama_server_stop()
-    llama_server_completion(json_req *C.char, resp *C.ext_server_resp_t)
-    llama_server_completion_next_result(task_id C.int, resp *C.ext_server_task_result_t)
-    llama_server_completion_cancel(task_id C.int, err *C.ext_server_resp_t)
-    llama_server_release_task_result(result *C.ext_server_task_result_t)
-    llama_server_tokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
-    llama_server_detokenize(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
-    llama_server_embedding(json_req *C.char, json_resp **C.char, err *C.ext_server_resp_t)
-    llama_server_release_json_resp(json_resp **C.char)
+type dynExtServer struct {
+    s       C.struct_dynamic_llama_server
+    options api.Options
}

// Note: current implementation does not support concurrent instantiations
@@ -76,11 +64,30 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
    return fmt.Errorf(C.GoString(resp.msg))
}

-func newExtServer(server extServer, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
+// Note: current implementation does not support concurrent instantiations
+var llm *dynExtServer
+
+func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
    if !mutex.TryLock() {
        log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
        mutex.Lock()
    }
+    updatePath(filepath.Dir(library))
+    libPath := C.CString(library)
+    defer C.free(unsafe.Pointer(libPath))
+    resp := newExtServerResp(128)
+    defer freeExtServerResp(resp)
+    var srv C.struct_dynamic_llama_server
+    C.dyn_init(libPath, &srv, &resp)
+    if resp.id < 0 {
+        mutex.Unlock()
+        return nil, fmt.Errorf("Unable to load dynamic library: %s", C.GoString(resp.msg))
+    }
+    llm = &dynExtServer{
+        s:       srv,
+        options: opts,
+    }
+    log.Printf("Loading Dynamic llm server: %s", library)

    var sparams C.ext_server_params_t
    sparams.model = C.CString(model)
@@ -129,20 +136,20 @@ func newExtServer(server extServer, model string, adapters, projectors []string,
    sparams.n_threads = C.uint(opts.NumThread)

-    log.Printf("Initializing internal llama server")
-    resp := newExtServerResp(128)
-    defer freeExtServerResp(resp)
-    server.llama_server_init(&sparams, &resp)
-    if resp.id < 0 {
-        return nil, extServerResponseToErr(resp)
+    log.Printf("Initializing llama server")
+    initResp := newExtServerResp(128)
+    defer freeExtServerResp(initResp)
+    C.dyn_llama_server_init(llm.s, &sparams, &initResp)
+    if initResp.id < 0 {
+        return nil, extServerResponseToErr(initResp)
    }

-    log.Printf("Starting internal llama main loop")
-    server.llama_server_start()
-    return server, nil
+    log.Printf("Starting llama main loop")
+    C.dyn_llama_server_start(llm.s)
+    return llm, nil
}
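For orientation, here is a minimal sketch of how a caller might exercise this new entry point. The library and model paths, and the PredictOpts/PredictResult fields used, are illustrative assumptions rather than part of this diff:

    // Hypothetical caller (not part of this change): load one dynamic
    // llama.cpp server library, run a single completion, then shut it down.
    srv, err := newDynExtServer("/tmp/ollama/cpu_avx2/libext_server.so",
        "/path/to/model.gguf", nil, nil, api.DefaultOptions())
    if err != nil {
        log.Fatal(err)
    }
    defer srv.Close()
    err = srv.Predict(context.Background(), PredictOpts{Prompt: "why is the sky blue?"},
        func(r PredictResult) {
            fmt.Print(r.Content) // stream partial responses as they arrive
        })
    if err != nil {
        log.Fatal(err)
    }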
-func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(PredictResult)) error {
+func (llm *dynExtServer) Predict(ctx context.Context, predict PredictOpts, fn func(PredictResult)) error {
    resp := newExtServerResp(128)
    defer freeExtServerResp(resp)
    var imageData []ImageData
@@ -200,7 +207,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
    req := C.CString(buffer.String())
    defer C.free(unsafe.Pointer(req))

-    llm.llama_server_completion(req, &resp)
+    C.dyn_llama_server_completion(llm.s, req, &resp)
    if resp.id < 0 {
        return extServerResponseToErr(resp)
    }
@@ -211,7 +218,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
    select {
    case <-ctx.Done():
        // This handles the request cancellation
-        llm.llama_server_completion_cancel(resp.id, &resp)
+        C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
        if resp.id < 0 {
            return extServerResponseToErr(resp)
        } else {
@@ -219,13 +226,13 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
        }
    default:
        var result C.ext_server_task_result_t
-        llm.llama_server_completion_next_result(resp.id, &result)
+        C.dyn_llama_server_completion_next_result(llm.s, resp.id, &result)
        json_resp := C.GoString(result.json_resp)
-        llm.llama_server_release_task_result(&result)
+        C.dyn_llama_server_release_task_result(llm.s, &result)

        var p prediction
        if err := json.Unmarshal([]byte(json_resp), &p); err != nil {
-            llm.llama_server_completion_cancel(resp.id, &resp)
+            C.dyn_llama_server_completion_cancel(llm.s, resp.id, &resp)
            if resp.id < 0 {
                return fmt.Errorf("error unmarshaling llm prediction response: %w and cancel %s", err, C.GoString(resp.msg))
            } else {
@@ -266,7 +273,7 @@ func predict(ctx context.Context, llm extServer, predict PredictOpts, fn func(Pr
    return fmt.Errorf("max retries exceeded")
}

-func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
+func (llm *dynExtServer) Encode(ctx context.Context, prompt string) ([]int, error) {
    data, err := json.Marshal(TokenizeRequest{Content: prompt})
    if err != nil {
        return nil, fmt.Errorf("marshaling encode data: %w", err)
@@ -276,11 +283,11 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
    var json_resp *C.char
    resp := newExtServerResp(128)
    defer freeExtServerResp(resp)
-    llm.llama_server_tokenize(req, &json_resp, &resp)
+    C.dyn_llama_server_tokenize(llm.s, req, &json_resp, &resp)
    if resp.id < 0 {
        return nil, extServerResponseToErr(resp)
    }
-    defer llm.llama_server_release_json_resp(&json_resp)
+    defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)

    var encoded TokenizeResponse
    if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &encoded); err2 != nil {
@@ -290,7 +297,7 @@ func encode(llm extServer, ctx context.Context, prompt string) ([]int, error) {
    return encoded.Tokens, err
}

-func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
+func (llm *dynExtServer) Decode(ctx context.Context, tokens []int) (string, error) {
    if len(tokens) == 0 {
        return "", nil
    }
@@ -304,11 +311,11 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
    var json_resp *C.char
    resp := newExtServerResp(128)
    defer freeExtServerResp(resp)
-    llm.llama_server_detokenize(req, &json_resp, &resp)
+    C.dyn_llama_server_detokenize(llm.s, req, &json_resp, &resp)
    if resp.id < 0 {
        return "", extServerResponseToErr(resp)
    }
-    defer llm.llama_server_release_json_resp(&json_resp)
+    defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)

    var decoded DetokenizeResponse
    if err2 := json.Unmarshal([]byte(C.GoString(json_resp)), &decoded); err2 != nil {
@@ -318,7 +325,7 @@ func decode(llm extServer, ctx context.Context, tokens []int) (string, error) {
    return decoded.Content, err
}
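A short round trip through the tokenizer methods above, assuming srv is the LLM returned by newDynExtServer and ctx is a context.Context; the prompt text is illustrative:

    // Hypothetical usage of Encode/Decode; errors are returned to the caller unchanged.
    tokens, err := srv.Encode(ctx, "hello world")
    if err != nil {
        return err
    }
    text, err := srv.Decode(ctx, tokens)
    if err != nil {
        return err
    }
    log.Printf("round trip: %q -> %v -> %q", "hello world", tokens, text)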
-func embedding(llm extServer, ctx context.Context, input string) ([]float64, error) {
+func (llm *dynExtServer) Embedding(ctx context.Context, input string) ([]float64, error) {
    data, err := json.Marshal(TokenizeRequest{Content: input})
    if err != nil {
        return nil, fmt.Errorf("error marshaling embed data: %w", err)
@@ -329,11 +336,11 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err
    var json_resp *C.char
    resp := newExtServerResp(128)
    defer freeExtServerResp(resp)
-    llm.llama_server_embedding(req, &json_resp, &resp)
+    C.dyn_llama_server_embedding(llm.s, req, &json_resp, &resp)
    if resp.id < 0 {
        return nil, extServerResponseToErr(resp)
    }
-    defer llm.llama_server_release_json_resp(&json_resp)
+    defer C.dyn_llama_server_release_json_resp(llm.s, &json_resp)

    var embedding EmbeddingResponse
    if err := json.Unmarshal([]byte(C.GoString(json_resp)), &embedding); err != nil {
@@ -343,7 +350,38 @@ func embedding(llm extServer, ctx context.Context, input string) ([]float64, err
    return embedding.Embedding, nil
}

-func close(llm extServer) {
-    llm.llama_server_stop()
+func (llm *dynExtServer) Close() {
+    C.dyn_llama_server_stop(llm.s)
    mutex.Unlock()
}
+
+func updatePath(dir string) {
+    if runtime.GOOS == "windows" {
+        tmpDir := filepath.Dir(dir)
+        pathComponents := strings.Split(os.Getenv("PATH"), ";")
+        i := 0
+        for _, comp := range pathComponents {
+            if strings.EqualFold(comp, dir) {
+                return
+            }
+            // Remove any other prior paths to our temp dir
+            if !strings.HasPrefix(strings.ToLower(comp), strings.ToLower(tmpDir)) {
+                pathComponents[i] = comp
+                i++
+            }
+        }
+        // Keep only the first i components retained above; joining the full
+        // slice would re-add the stale entries the loop filtered out.
+        newPath := strings.Join(append([]string{dir}, pathComponents[:i]...), ";")
+        log.Printf("Updating PATH to %s", newPath)
+        os.Setenv("PATH", newPath)
+    } else {
+        pathComponents := strings.Split(os.Getenv("LD_LIBRARY_PATH"), ":")
+        for _, comp := range pathComponents {
+            if comp == dir {
+                return
+            }
+        }
+        newPath := strings.Join(append([]string{dir}, pathComponents...), ":")
+        log.Printf("Updating LD_LIBRARY_PATH to %s", newPath)
+        os.Setenv("LD_LIBRARY_PATH", newPath)
+    }
+}
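To illustrate the helper above: updatePath prepends the library's directory to the loader search path (PATH on Windows, LD_LIBRARY_PATH elsewhere), presumably so the freshly loaded server library can resolve its bundled dependencies, and it returns without changes if the directory is already listed. The directory and existing values below are made up:

    // Hypothetical effect of updatePath on Linux.
    os.Setenv("LD_LIBRARY_PATH", "/usr/lib:/opt/rocm/lib")
    updatePath("/tmp/ollama1234/cpu_avx2")
    // LD_LIBRARY_PATH is now "/tmp/ollama1234/cpu_avx2:/usr/lib:/opt/rocm/lib".
    updatePath("/tmp/ollama1234/cpu_avx2") // already present: returns early, no change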