1 year ago · 11d82d7b9b
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -24,6 +24,7 @@ import (
 
				 	"time"
			
 
				 
			
 
				 	"github.com/jmorganca/ollama/api"
			
 
				+	"github.com/jmorganca/ollama/format"
			
 
				 )
			
 
				 
			
 
				 //go:embed llama.cpp/*/build/*/bin/*
			
@@ -197,7 +198,7 @@ type llama struct {
 
				 
			
 
				 var errNoGPU = errors.New("nvidia-smi command failed")
			
 
				 
			
 
				-// CheckVRAM returns the available VRAM in MiB on Linux machines with NVIDIA GPUs
			
 
				+// CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
			
 
				 func CheckVRAM() (int64, error) {
			
 
				 	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.free", "--format=csv,noheader,nounits")
			
 
				 	var stdout bytes.Buffer
			
@@ -207,7 +208,7 @@ func CheckVRAM() (int64, error) {
 
				 		return 0, errNoGPU
			
 
				 	}
			
 
				 
			
 
				-	var free int64
			
 
				+	var freeMiB int64
			
 
				 	scanner := bufio.NewScanner(&stdout)
			
 
				 	for scanner.Scan() {
			
 
				 		line := scanner.Text()
			
@@ -216,15 +217,16 @@ func CheckVRAM() (int64, error) {
 
				 			return 0, fmt.Errorf("failed to parse available VRAM: %v", err)
			
 
				 		}
			
 
				 
			
 
				-		free += vram
			
 
				+		freeMiB += vram
			
 
				 	}
			
 
				 
			
 
				-	if free*1024*1024 < 2*1000*1000*1000 {
			
 
				+	freeBytes := freeMiB * 1024 * 1024
			
 
				+	if freeBytes < 2*format.GigaByte {
			
 
				 		log.Printf("less than 2 GB VRAM available, falling back to CPU only")
			
 
				-		free = 0
			
 
				+		freeMiB = 0
			
 
				 	}
			
 
				 
			
 
				-	return free, nil
			
 
				+	return freeBytes, nil
			
 
				 }
			
 
				 
			
 
				 func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
			
@@ -232,7 +234,7 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 
				 		return opts.NumGPU
			
 
				 	}
			
 
				 	if runtime.GOOS == "linux" {
			
 
				-		vramMib, err := CheckVRAM()
			
 
				+		freeBytes, err := CheckVRAM()
			
 
				 		if err != nil {
			
 
				 			if err.Error() != "nvidia-smi command failed" {
			
 
				 				log.Print(err.Error())
			
@@ -241,15 +243,13 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 
				 			return 0
			
 
				 		}
			
 
				 
			
 
				-		freeVramBytes := int64(vramMib) * 1024 * 1024 // 1 MiB = 1024^2 bytes
			
 
				-
			
 
				 		// Calculate bytes per layer
			
 
				 		// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
			
 
				 		bytesPerLayer := fileSizeBytes / numLayer
			
 
				 
			
 
				 		// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
			
 
				-		layers := int(freeVramBytes/bytesPerLayer) * 92 / 100
			
 
				-		log.Printf("%d MiB VRAM available, loading up to %d GPU layers", vramMib, layers)
			
 
				+		layers := int(freeBytes/bytesPerLayer) * 92 / 100
			
 
				+		log.Printf("%d MiB VRAM available, loading up to %d GPU layers", freeBytes, layers)
			
 
				 
			
 
				 		return layers
			
 
				 	}