@@ -58,6 +58,12 @@ func chooseRunner(gpuPath, cpuPath string) string {
 		if llamaPath == osPath(gpuPath) {
 			files = append(files, "ggml-metal.metal")
 		}
+	case "linux":
+		// check if there is a GPU available
+		if _, err := CheckVRAM(); errors.Is(err, errNoGPU) {
+			// this error was logged on start-up, so we don't need to log it again
+			llamaPath = osPath(cpuPath)
+		}
 	}
 
 	for _, f := range files {
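
For reference, the `errors.Is(err, errNoGPU)` check works because `errNoGPU` (introduced later in this diff) is a package-level sentinel error. A minimal standalone sketch of the fallback pattern, with a hypothetical `pickRunner` and `checkVRAM` standing in for the real `chooseRunner` and `CheckVRAM`:

```go
package main

import (
	"errors"
	"fmt"
)

var errNoGPU = errors.New("nvidia-smi command failed")

// checkVRAM stands in for CheckVRAM; here it always reports no GPU.
func checkVRAM() (int, error) { return 0, errNoGPU }

// pickRunner mirrors the fallback above: prefer the GPU runner,
// but use the CPU runner when no GPU is detected.
func pickRunner(gpuPath, cpuPath string) string {
	if _, err := checkVRAM(); errors.Is(err, errNoGPU) {
		return cpuPath
	}
	return gpuPath
}

func main() {
	fmt.Println(pickRunner("gpu/server", "cpu/server")) // cpu/server
}
```
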
@@ -218,6 +224,72 @@ type llama struct {
 	Running
 }
 
+var errNoGPU = errors.New("nvidia-smi command failed")
+
+// CheckVRAM returns the total VRAM in MiB on Linux machines with NVIDIA GPUs
+func CheckVRAM() (int, error) {
+	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits")
+	var stdout bytes.Buffer
+	cmd.Stdout = &stdout
+	err := cmd.Run()
+	if err != nil {
+		return 0, errNoGPU
+	}
+
+	var total int
+	scanner := bufio.NewScanner(&stdout)
+	for scanner.Scan() {
+		line := scanner.Text()
+		vram, err := strconv.Atoi(line)
+		if err != nil {
+			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
+		}
+
+		total += vram
+	}
+
+	return total, nil
+}
+
+func NumGPU(opts api.Options) int {
+	if opts.NumGPU != -1 {
+		return opts.NumGPU
+	}
+	n := 1 // default to enable metal on macOS
+	if runtime.GOOS == "linux" {
+		vram, err := CheckVRAM()
+		if err != nil {
+			if !errors.Is(err, errNoGPU) {
+				log.Print(err.Error())
+			}
+			// nvidia driver not installed or no nvidia GPU found
+			return 0
+		}
+		// TODO: this is a very rough heuristic, better would be to calculate this based on number of layers and context size
+		switch {
+		case vram < 500:
+			log.Printf("WARNING: Low VRAM detected, disabling GPU")
+			n = 0
+		case vram < 1000:
+			n = 4
+		case vram < 2000:
+			n = 8
+		case vram < 4000:
+			n = 12
+		case vram < 8000:
+			n = 16
+		case vram < 12000:
+			n = 24
+		case vram < 16000:
+			n = 32
+		default:
+			n = 48
+		}
+		log.Printf("%d MB VRAM available, loading %d GPU layers", vram, n)
+	}
+	return n
+}
+
 func newLlama(model string, adapters []string, runner ModelRunner, opts api.Options) (*llama, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
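
CheckVRAM sums one `memory.total` line per GPU, so a multi-GPU machine reports the combined figure, and NumGPU then maps that figure onto a layer count. A rough standalone sketch of that pipeline, using canned nvidia-smi output instead of running the command; `sumVRAM` and `layersFor` are hypothetical helpers mirroring the two functions above, and the values are illustrative:

```go
package main

import (
	"bufio"
	"fmt"
	"strconv"
	"strings"
)

// sumVRAM mirrors the parsing loop in CheckVRAM, reading canned
// "csv,noheader,nounits" output: one MiB figure per GPU, one per line.
func sumVRAM(out string) (int, error) {
	var total int
	scanner := bufio.NewScanner(strings.NewReader(out))
	for scanner.Scan() {
		vram, err := strconv.Atoi(strings.TrimSpace(scanner.Text()))
		if err != nil {
			return 0, fmt.Errorf("failed to parse VRAM: %v", err)
		}
		total += vram
	}
	return total, nil
}

// layersFor mirrors the heuristic table in NumGPU.
func layersFor(vram int) int {
	switch {
	case vram < 500:
		return 0 // too little VRAM, GPU disabled
	case vram < 1000:
		return 4
	case vram < 2000:
		return 8
	case vram < 4000:
		return 12
	case vram < 8000:
		return 16
	case vram < 12000:
		return 24
	case vram < 16000:
		return 32
	default:
		return 48
	}
}

func main() {
	// Two hypothetical 8 GiB cards: 16384 MiB combined, so 48 layers.
	total, _ := sumVRAM("8192\n8192\n")
	fmt.Println(total, layersFor(total)) // 16384 48
}
```
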
@@ -237,7 +309,7 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 		"--rope-freq-base", fmt.Sprintf("%f", opts.RopeFrequencyBase),
 		"--rope-freq-scale", fmt.Sprintf("%f", opts.RopeFrequencyScale),
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
-		"--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU),
+		"--n-gpu-layers", fmt.Sprintf("%d", NumGPU(opts)),
 		"--embedding",
 	}
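
The behavioral change in this hunk is that an explicit `NumGPU` setting still wins, while the default of `-1` now triggers VRAM-based detection instead of being passed to the server verbatim. A quick sketch of how the flag resolves under both paths; `resolveGPULayers` and the `opts` struct are hypothetical stand-ins for `NumGPU` and `api.Options`:

```go
package main

import "fmt"

// opts stands in for api.Options; NumGPU == -1 is the default,
// meaning "let the library decide".
type opts struct{ NumGPU int }

// resolveGPULayers mirrors the short-circuit at the top of NumGPU:
// an explicit value passes through, otherwise the heuristic applies.
func resolveGPULayers(o opts, heuristic int) int {
	if o.NumGPU != -1 {
		return o.NumGPU
	}
	return heuristic
}

func main() {
	fmt.Println(resolveGPULayers(opts{NumGPU: 10}, 48)) // 10: user override
	fmt.Println(resolveGPULayers(opts{NumGPU: -1}, 48)) // 48: auto-detected
	// The resolved value lands in the server arguments as:
	// "--n-gpu-layers", fmt.Sprintf("%d", resolveGPULayers(...))
}
```
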
@@ -305,7 +377,7 @@ func newLlama(model string, adapters []string, runner ModelRunner, opts api.Opti
 func waitForServer(llm *llama) error {
 	// wait for the server to start responding
 	start := time.Now()
-	expiresAt := time.Now().Add(30 * time.Second)
+	expiresAt := time.Now().Add(45 * time.Second)
 	ticker := time.NewTicker(200 * time.Millisecond)
 
 	log.Print("waiting for llama.cpp server to start responding")
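
The longer 45-second window matters because `waitForServer` polls on a 200 ms ticker until the deadline passes. A self-contained sketch of that wait pattern under the same ticker-and-deadline shape; `waitUntilReady` and `ping` are hypothetical stand-ins for the real function and its health check:

```go
package main

import (
	"fmt"
	"log"
	"time"
)

// waitUntilReady polls ping() every 200ms until it succeeds
// or the deadline passes, like waitForServer above.
func waitUntilReady(ping func() error, timeout time.Duration) error {
	start := time.Now()
	expiresAt := time.Now().Add(timeout)
	ticker := time.NewTicker(200 * time.Millisecond)
	defer ticker.Stop()

	for range ticker.C {
		if time.Now().After(expiresAt) {
			return fmt.Errorf("server did not start within %s", timeout)
		}
		if err := ping(); err == nil {
			log.Printf("server ready after %s", time.Since(start))
			return nil
		}
	}
	return nil
}

func main() {
	// Simulate a server that becomes healthy after 600ms.
	readyAt := time.Now().Add(600 * time.Millisecond)
	err := waitUntilReady(func() error {
		if time.Now().Before(readyAt) {
			return fmt.Errorf("not yet")
		}
		return nil
	}, 45*time.Second)
	fmt.Println(err) // <nil>
}
```
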