support for packaging in multiple cuda runners (#509)

* enable packaging multiple cuda versions
* use nvcc cuda version if available

---------

Co-authored-by: Michael Yang <mxyng@pm.me>
Bruce MacDonald 1 year ago
parent commit 2540c9181c
5 changed files with 95 additions and 37 deletions
  1. docs/development.md (+1 -1)
  2. llm/ggml.go (+1 -7)
  3. llm/gguf.go (+1 -7)
  4. llm/llama.cpp/generate_linux.go (+10 -4)
  5. llm/llama.go (+82 -18)

+ 1 - 1
docs/development.md

@@ -35,5 +35,5 @@ Now you can run `ollama`:
 ## Building on Linux with GPU support

 - Install cmake and nvidia-cuda-toolkit
-- run `go generate ./...`
+- run `CUDA_VERSION=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\)\.\([0-9]\+\).*$/\1/p') go generate ./...`
 - run `go build .`
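
As a side note on the new build step: the sed pipeline only keeps the CUDA major version from `nvcc --version`. A minimal Go sketch of the same extraction, run against illustrative nvcc output (not taken from this commit), could look like this:

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
	"strings"
)

func main() {
	// Illustrative nvcc --version output; real output depends on the installed toolkit.
	out := `nvcc: NVIDIA (R) Cuda compiler driver
Cuda compilation tools, release 12.2, V12.2.91`

	// Capture "major.minor" after the word "release", as llm/llama.go does below.
	re := regexp.MustCompile(`release (\d+\.\d+),`)
	m := re.FindStringSubmatch(out)
	if len(m) < 2 {
		fmt.Println("no CUDA version found")
		return
	}

	// Keep only the major version, which selects the packaged runner (e.g. cuda-12).
	major, err := strconv.Atoi(strings.Split(m[1], ".")[0])
	if err != nil {
		fmt.Println("unexpected version string:", m[1])
		return
	}
	fmt.Println(major) // prints 12
}
```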

+ 1 - 7
llm/ggml.go

@@ -4,7 +4,6 @@ import (
 	"encoding/binary"
 	"errors"
 	"io"
-	"path"
 	"sync"
 )

@@ -166,11 +165,6 @@ func (c *containerLORA) Decode(r io.Reader) (model, error) {
 	return nil, nil
 }

-var (
-	ggmlGPU = path.Join("llama.cpp", "ggml", "build", "gpu", "bin")
-	ggmlCPU = path.Join("llama.cpp", "ggml", "build", "cpu", "bin")
-)
-
 var (
 	ggmlInit       sync.Once
 	ggmlRunnerPath string
@@ -178,7 +172,7 @@ var (

 func ggmlRunner() ModelRunner {
 	ggmlInit.Do(func() {
-		ggmlRunnerPath = chooseRunner(ggmlGPU, ggmlCPU)
+		ggmlRunnerPath = chooseRunner("ggml")
 	})
 	return ModelRunner{Path: ggmlRunnerPath}
 }

+ 1 - 7
llm/gguf.go

@@ -6,7 +6,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"path"
 	"sync"
 )

@@ -370,11 +369,6 @@ func (llm *ggufModel) readArray(r io.Reader) (arr []any, err error) {
 	return
 }

-var (
-	ggufGPU = path.Join("llama.cpp", "gguf", "build", "gpu", "bin")
-	ggufCPU = path.Join("llama.cpp", "gguf", "build", "cpu", "bin")
-)
-
 var (
 	ggufInit       sync.Once
 	ggufRunnerPath string
@@ -382,7 +376,7 @@ var (

 func ggufRunner() ModelRunner {
 	ggufInit.Do(func() {
-		ggufRunnerPath = chooseRunner(ggufGPU, ggufCPU)
+		ggufRunnerPath = chooseRunner("gguf")
 	})

 	return ModelRunner{Path: ggufRunnerPath}

+ 10 - 4
llm/llama.cpp/generate_linux.go

@@ -7,9 +7,15 @@ package llm
 //go:generate git-apply ../ggml_patch/0001-add-detokenize-endpoint.patch
 //go:generate git-apply ../ggml_patch/0002-34B-model-support.patch
 //go:generate git-apply ../ggml_patch/0005-ggml-support-CUDA-s-half-type-for-aarch64-1455-2670.patch
-//go:generate cmake -S ggml -B ggml/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build ggml/build/gpu --target server --config Release
+
+//go:generate cmake -S ggml -B ggml/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cpu --target server --config Release

 //go:generate git submodule update --force gguf
-//go:generate cmake -S gguf -B gguf/build/gpu -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
-//go:generate cmake --build gguf/build/gpu --target server --config Release
+//go:generate cmake -S gguf -B gguf/build/cpu -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cpu --target server --config Release
+
+//go:generate cmake -S ggml -B ggml/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build ggml/build/cuda-${CUDA_VERSION} --target server --config Release
+//go:generate cmake -S gguf -B gguf/build/cuda-${CUDA_VERSION} -DLLAMA_CUBLAS=on -DLLAMA_ACCELERATE=on -DLLAMA_K_QUANTS=on
+//go:generate cmake --build gguf/build/cuda-${CUDA_VERSION} --target server --config Release
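
The CUDA build directories above are parameterized by `${CUDA_VERSION}`, which `go generate` expands from the environment, so the variable must be set when generating (as the updated docs/development.md shows). A small illustrative sketch of that expansion, not part of the commit:

```go
package main

import (
	"fmt"
	"os"
)

func main() {
	// Stand-in for the value exported in the shell, e.g.:
	//   CUDA_VERSION=12 go generate ./...
	os.Setenv("CUDA_VERSION", "12")

	// go generate substitutes environment variables in //go:generate directives,
	// so the CUDA build directory name depends on CUDA_VERSION at generate time.
	fmt.Println(os.ExpandEnv("gguf/build/cuda-${CUDA_VERSION}")) // gguf/build/cuda-12
}
```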

+ 82 - 18
llm/llama.go

@@ -17,6 +17,7 @@ import (
 	"os/exec"
 	"path"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
@@ -36,36 +37,99 @@ func osPath(llamaPath string) string {
 	return llamaPath
 }

-func chooseRunner(gpuPath, cpuPath string) string {
-	tmpDir, err := os.MkdirTemp("", "llama-*")
+func cudaVersion() (int, error) {
+	// first try nvcc, it gives the most accurate version if available
+	cmd := exec.Command("nvcc", "--version")
+	output, err := cmd.CombinedOutput()
+	if err == nil {
+		// regex to match the CUDA version line in nvcc --version output
+		re := regexp.MustCompile(`release (\d+\.\d+),`)
+		matches := re.FindStringSubmatch(string(output))
+		if len(matches) >= 2 {
+			cudaVersion := matches[1]
+			cudaVersionParts := strings.Split(cudaVersion, ".")
+			cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
+			if err == nil {
+				return cudaMajorVersion, nil
+			}
+		}
+	}
+
+	// fallback to nvidia-smi
+	cmd = exec.Command("nvidia-smi")
+	output, err = cmd.CombinedOutput()
 	if err != nil {
-		log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
+		return -1, err
 	}

-	llamaPath := osPath(gpuPath)
-	if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-		llamaPath = osPath(cpuPath)
-		if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-			log.Fatalf("llama.cpp executable not found")
-		}
+	re := regexp.MustCompile(`CUDA Version: (\d+\.\d+)`)
+	matches := re.FindStringSubmatch(string(output))
+	if len(matches) < 2 {
+		return -1, errors.New("could not find CUDA version")
+	}
+
+	cudaVersion := matches[1]
+	cudaVersionParts := strings.Split(cudaVersion, ".")
+	cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
+	if err != nil {
+		return -1, err
+	}
+	return cudaMajorVersion, nil
+}
+
+func chooseRunner(runnerType string) string {
+	tmpDir, err := os.MkdirTemp("", "llama-*")
+	if err != nil {
+		log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
 	}

+	cpuPath := osPath(path.Join("llama.cpp", runnerType, "build", "cpu", "bin"))
+	llamaPath := cpuPath
 	files := []string{"server"}
+
+	// Set OS specific llama.cpp runner paths
 	switch runtime.GOOS {
-	case "windows":
-		files = []string{"server.exe"}
 	case "darwin":
-		if llamaPath == osPath(gpuPath) {
-			files = append(files, "ggml-metal.metal")
-		}
+		// TODO: change to check metal version
+		llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", "gpu", "bin"))
+		files = append(files, "ggml-metal.metal")
 	case "linux":
-		// check if there is a GPU available
-		if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
-			// this error was logged on start-up, so we don't need to log it again
-			llamaPath = osPath(cpuPath)
+		cudaVersion, err := cudaVersion()
+		if err != nil {
+			// fall back to the CPU runner in the CUDA version switch below
+			log.Printf("failed to get CUDA version: %v", err)
+		}
+
+		switch cudaVersion {
+		case 11, 12:
+			cudaDir := fmt.Sprintf("cuda-%d", cudaVersion)
+			llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", cudaDir, "bin"))
+		default:
+			if cudaVersion != -1 {
+				// a valid version was returned but it is not supported
+				log.Printf("CUDA version %d not supported, falling back to CPU", cudaVersion)
+			}
+			llamaPath = cpuPath
+		}
+	case "windows":
+		// TODO: select windows GPU runner here when available
+		files = []string{"server.exe"}
+	default:
+		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
+	}
+
+	// check if the runner exists, if not fallback to CPU runner
+	if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
+		// fallback to CPU runner
+		llamaPath = cpuPath
+		files = []string{"server"}
+		if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
+			log.Fatalf("llama.cpp executable not found")
 		}
+		log.Printf("llama.cpp %s executable not found, falling back to cpu", runnerType)
 	}

+	// copy the files locally to run the llama.cpp server
 	for _, f := range files {
 		srcPath := path.Join(llamaPath, f)
 		destPath := filepath.Join(tmpDir, f)
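
For orientation, the Linux runner selection introduced in chooseRunner can be restated as a small standalone sketch; buildDir below is a hypothetical helper that mirrors the switch over the detected CUDA major version, not code from the commit:

```go
package main

import (
	"fmt"
	"path"
)

// buildDir mirrors the Linux branch of chooseRunner: CUDA 11 and 12 map to a
// packaged cuda-<major> runner, anything else falls back to the CPU build.
// Illustrative restatement only.
func buildDir(runnerType string, cudaMajor int) string {
	switch cudaMajor {
	case 11, 12:
		return path.Join("llama.cpp", runnerType, "build", fmt.Sprintf("cuda-%d", cudaMajor), "bin")
	default:
		return path.Join("llama.cpp", runnerType, "build", "cpu", "bin")
	}
}

func main() {
	fmt.Println(buildDir("gguf", 12)) // llama.cpp/gguf/build/cuda-12/bin
	fmt.Println(buildDir("gguf", 10)) // unsupported major version -> llama.cpp/gguf/build/cpu/bin
}
```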