@@ -17,6 +17,7 @@ import (
 	"os/exec"
 	"path"
 	"path/filepath"
+	"regexp"
 	"runtime"
 	"strconv"
 	"strings"
@@ -36,36 +37,101 @@ func osPath(llamaPath string) string {
 	return llamaPath
 }
 
-func chooseRunner(gpuPath, cpuPath string) string {
-	tmpDir, err := os.MkdirTemp("", "llama-*")
+func cudaVersion() (int, error) {
+	// first try nvcc; it gives the most accurate version if available
+	cmd := exec.Command("nvcc", "--version")
+	output, err := cmd.CombinedOutput()
+	if err == nil {
+		// regex to match the CUDA version line in nvcc --version output
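+		// e.g. "Cuda compilation tools, release 12.2, V12.2.140"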
+		re := regexp.MustCompile(`release (\d+\.\d+),`)
+		matches := re.FindStringSubmatch(string(output))
+		if len(matches) >= 2 {
+			cudaVersion := matches[1]
+			cudaVersionParts := strings.Split(cudaVersion, ".")
+			cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
+			if err == nil {
+				return cudaMajorVersion, nil
+			}
+		}
+	}
+
+	// fall back to nvidia-smi
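+	// nvidia-smi reports the driver's CUDA version in its header, e.g. "CUDA Version: 12.2"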
+	cmd = exec.Command("nvidia-smi")
+	output, err = cmd.CombinedOutput()
 	if err != nil {
-		log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
+		return -1, err
 	}
 
-	llamaPath := osPath(gpuPath)
-	if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-		llamaPath = osPath(cpuPath)
-		if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
-			log.Fatalf("llama.cpp executable not found")
-		}
+	re := regexp.MustCompile(`CUDA Version: (\d+\.\d+)`)
+	matches := re.FindStringSubmatch(string(output))
+	if len(matches) < 2 {
+		return -1, errors.New("could not find CUDA version")
+	}
+
+	cudaVersion := matches[1]
+	cudaVersionParts := strings.Split(cudaVersion, ".")
+	cudaMajorVersion, err := strconv.Atoi(cudaVersionParts[0])
+	if err != nil {
+		return -1, err
+	}
+	return cudaMajorVersion, nil
+}
+
+func chooseRunner(runnerType string) string {
+	tmpDir, err := os.MkdirTemp("", "llama-*")
+	if err != nil {
+		log.Fatalf("llama.cpp: failed to create temp dir: %v", err)
 	}
 
+	cpuPath := osPath(path.Join("llama.cpp", runnerType, "build", "cpu", "bin"))
+	llamaPath := cpuPath
 	files := []string{"server"}
+
+	// set OS-specific llama.cpp runner paths
 	switch runtime.GOOS {
-	case "windows":
-		files = []string{"server.exe"}
 	case "darwin":
-		if llamaPath == osPath(gpuPath) {
-			files = append(files, "ggml-metal.metal")
-		}
+		// TODO: change to check metal version
+		llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", "gpu", "bin"))
+		files = append(files, "ggml-metal.metal")
 	case "linux":
-		// check if there is a GPU available
-		if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
-			// this error was logged on start-up, so we don't need to log it again
-			llamaPath = osPath(cpuPath)
+		cudaVersion, err := cudaVersion()
+		if err != nil {
+			// fall back to the CPU runner in the CUDA version check below
+			log.Printf("failed to get CUDA version: %v", err)
+		}
+
+		switch cudaVersion {
+		case 11, 12:
+			cudaDir := fmt.Sprintf("cuda-%d", cudaVersion)
+			llamaPath = osPath(path.Join("llama.cpp", runnerType, "build", cudaDir, "bin"))
+		default:
+			if cudaVersion != -1 {
+				// a valid version was returned but it is not supported
+				log.Printf("CUDA version %d not supported, falling back to CPU", cudaVersion)
+			}
+			llamaPath = cpuPath
+		}
+	case "windows":
+		// TODO: select windows GPU runner here when available
+		files = []string{"server.exe"}
+	default:
+		log.Printf("unknown OS, running on CPU: %s", runtime.GOOS)
+	}
+
+	// check that the chosen runner exists; if not, fall back to the CPU runner
+	if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
+		// fall back to the CPU runner
+		llamaPath = cpuPath
+		files = []string{"server"}
+		if _, err := fs.Stat(llamaCppEmbed, llamaPath); err != nil {
+			log.Fatalf("llama.cpp executable not found")
 		}
+		log.Printf("llama.cpp %s executable not found, falling back to cpu", runnerType)
 	}
 
+	// copy the files locally to run the llama.cpp server
 	for _, f := range files {
 		srcPath := path.Join(llamaPath, f)
 		destPath := filepath.Join(tmpDir, f)