1 年之前 · 7427fa1387
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -34,7 +34,7 @@ func GetGPUInfo() GpuInfo {
 
				 	mem, _ := getCPUMem()
			
 
				 	if runtime.GOARCH == "amd64" {
			
 
				 		return GpuInfo{
			
 
				-			Library: "default",
			
 
				+			Library: "cpu",
			
 
				 			Variant: GetCPUVariant(),
			
 
				 			memInfo: mem,
			
 
				 		}
			
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -2,6 +2,7 @@ package llm
 
				 
			
 
				 import (
			
 
				 	"context"
			
 
				+	"fmt"
			
 
				 	"log"
			
 
				 	"os"
			
 
				 	"runtime"
			
@@ -50,7 +51,6 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 
				 	graph := int64(ggml.NumGQA()) * kv / 6
			
 
				 
			
 
				 	info := gpu.GetGPUInfo()
			
 
				-	library := info.Library
			
 
				 	switch runtime.GOOS {
			
 
				 	case "darwin":
			
 
				 		if opts.NumGPU == 0 {
			
@@ -59,13 +59,15 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 
				 
			
 
				 		if size+kv+graph > vram {
			
 
				 			log.Println("not enough vram available, falling back to CPU only")
			
 
				+			info.Library = "cpu"
			
 
				+			info.Variant = gpu.GetCPUVariant()
			
 
				 			opts.NumGPU = 0
			
 
				 			break
			
 
				 		}
			
 
				 
			
 
				 		opts.NumGPU = 1
			
 
				 	default:
			
 
				-		if library == "cpu" || library == "default" {
			
 
				+		if info.Library == "cpu" {
			
 
				 			log.Println("GPU not available, falling back to CPU")
			
 
				 			opts.NumGPU = 0
			
 
				 			break
			
@@ -73,7 +75,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 
				 
			
 
				 		// don't use GPU at all if no layers are loaded
			
 
				 		if opts.NumGPU == 0 {
			
 
				-			library = "cpu"
			
 
				+			info.Library = "cpu"
			
 
				+			info.Variant = gpu.GetCPUVariant()
			
 
				 			break
			
 
				 		}
			
 
				 
			
@@ -100,7 +103,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 
				 		min := graph + kv*layers/maxlayers
			
 
				 		if layers <= 0 || min > avg {
			
 
				 			log.Printf("not enough vram available, falling back to CPU only")
			
 
				-			library = "cpu"
			
 
				+			info.Library = "cpu"
			
 
				+			info.Variant = gpu.GetCPUVariant()
			
 
				 			opts.NumGPU = 0
			
 
				 			break
			
 
				 		}
			
@@ -110,8 +114,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 
				 
			
 
				 	opts.RopeFrequencyBase = 0.0
			
 
				 	opts.RopeFrequencyScale = 0.0
			
 
				-	gpuInfo := gpu.GetGPUInfo()
			
 
				-	return newLlmServer(gpuInfo, model, adapters, projectors, opts)
			
 
				+	return newLlmServer(info, model, adapters, projectors, opts)
			
 
				 }
			
 
				 
			
 
				 // Give any native cgo implementations an opportunity to initialize
			
--- a/llm/payload_common.go
+++ b/llm/payload_common.go
@@ -28,6 +28,13 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string {
 
				 	if gpuInfo.Library == "default" {
			
 
				 		return []string{"default"}
			
 
				 	}
			
 
				+	// TODO - temporary until we have multiple CPU variations for Darwin
			
 
				+	// Short circuit on darwin with metal only
			
 
				+	if len(availableDynLibs) == 1 {
			
 
				+		if _, onlyMetal := availableDynLibs["metal"]; onlyMetal {
			
 
				+			return []string{availableDynLibs["metal"]}
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 	exactMatch := ""
			
 
				 	dynLibs := []string{}
			
--- a/llm/payload_test.go
+++ b/llm/payload_test.go
@@ -16,39 +16,43 @@ func TestGetDynLibs(t *testing.T) {
 
				 	assert.Len(t, res, 1)
			
 
				 	assert.Equal(t, availableDynLibs["cpu"], res[0])
			
 
				 
			
 
				+	variant := gpu.GetCPUVariant()
			
 
				+	if variant != "" {
			
 
				+		variant = "_" + variant
			
 
				+	}
			
 
				 	availableDynLibs = map[string]string{
			
 
				-		"rocm_v5": "X_rocm_v5",
			
 
				-		"rocm_v6": "X_rocm_v6",
			
 
				-		"cpu":     "X_cpu",
			
 
				+		"rocm_v5":       "X_rocm_v5",
			
 
				+		"rocm_v6":       "X_rocm_v6",
			
 
				+		"cpu" + variant: "X_cpu",
			
 
				 	}
			
 
				 	assert.Equal(t, true, rocmDynLibPresent())
			
 
				 	res = getDynLibs(gpu.GpuInfo{Library: "rocm"})
			
 
				 	assert.Len(t, res, 3)
			
 
				 	assert.Equal(t, availableDynLibs["rocm_v5"], res[0])
			
 
				 	assert.Equal(t, availableDynLibs["rocm_v6"], res[1])
			
 
				-	assert.Equal(t, availableDynLibs["cpu"], res[2])
			
 
				+	assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
			
 
				 
			
 
				 	res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
			
 
				 	assert.Len(t, res, 3)
			
 
				 	assert.Equal(t, availableDynLibs["rocm_v6"], res[0])
			
 
				 	assert.Equal(t, availableDynLibs["rocm_v5"], res[1])
			
 
				-	assert.Equal(t, availableDynLibs["cpu"], res[2])
			
 
				+	assert.Equal(t, availableDynLibs["cpu"+variant], res[2])
			
 
				 
			
 
				 	res = getDynLibs(gpu.GpuInfo{Library: "cuda"})
			
 
				 	assert.Len(t, res, 1)
			
 
				-	assert.Equal(t, availableDynLibs["cpu"], res[0])
			
 
				+	assert.Equal(t, availableDynLibs["cpu"+variant], res[0])
			
 
				 
			
 
				 	res = getDynLibs(gpu.GpuInfo{Library: "default"})
			
 
				 	assert.Len(t, res, 1)
			
 
				 	assert.Equal(t, "default", res[0])
			
 
				 
			
 
				 	availableDynLibs = map[string]string{
			
 
				-		"rocm": "X_rocm_v5",
			
 
				-		"cpu":  "X_cpu",
			
 
				+		"rocm":          "X_rocm_v5",
			
 
				+		"cpu" + variant: "X_cpu",
			
 
				 	}
			
 
				 	assert.Equal(t, true, rocmDynLibPresent())
			
 
				 	res = getDynLibs(gpu.GpuInfo{Library: "rocm", Variant: "v6"})
			
 
				 	assert.Len(t, res, 2)
			
 
				 	assert.Equal(t, availableDynLibs["rocm"], res[0])
			
 
				-	assert.Equal(t, availableDynLibs["cpu"], res[1])
			
 
				+	assert.Equal(t, availableDynLibs["cpu"+variant], res[1])
			
 
				 }