Offload layers to GPU based on new model size estimates (#1850)

* select layers based on estimated model memory usage

* always account for scratch vram

* don't load +1 layers

* better estimation for graph alloc

* Update gpu/gpu_darwin.go

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update llm/llm.go

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* Update llm/llm.go

* add overhead for cuda memory

* Update llm/llm.go

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>

* fix build error on linux

* address comments

---------

Co-authored-by: Bruce MacDonald <brucewmacdonald@gmail.com>
Jeffrey Morgan, 1 year ago
commit 08f1e18965
10 changed files with 161 additions and 154 deletions

  1. gpu/gpu.go (+7 -26)
  2. gpu/gpu_darwin.go (+17 -17)
  3. llm/ext_server_common.go (+3 -10)
  4. llm/ext_server_default.go (+2 -2)
  5. llm/ggml.go (+5 -1)
  6. llm/gguf.go (+38 -3)
  7. llm/llama.go (+1 -60)
  8. llm/llm.go (+85 -32)
  9. llm/shim_darwin.go (+1 -1)
  10. llm/shim_ext_server.go (+2 -2)

+ 7 - 26
gpu/gpu.go

@@ -16,8 +16,6 @@ import (
 	"runtime"
 	"sync"
 	"unsafe"
-
-	"github.com/jmorganca/ollama/api"
 )
 
 type handles struct {
@@ -133,31 +131,14 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		return int64(gpuInfo.FreeMemory), nil
-	}
-	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
-}
+		// allocate 384MiB for llama.cpp overhead (outside of model)
+		overhead := uint64(384 * 1024 * 1024)
+		if gpuInfo.FreeMemory <= overhead {
+			return 0, nil
+		}
 
-func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
-	if opts.NumGPU != -1 {
-		return opts.NumGPU
+		return int64(gpuInfo.FreeMemory - overhead), nil
 	}
-	info := GetGPUInfo()
-	if info.Library == "cpu" || info.Library == "default" {
-		return 0
-	}
-
-	/*
-		Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
-		We can store the model weights and the kv cache in vram,
-		to enable kv chache vram storage add two additional layers to the number of layers retrieved from the model file.
-	*/
-	bytesPerLayer := uint64(fileSizeBytes / numLayer)
-
-	// 75% of the absolute max number of layers we can fit in available VRAM, off-loading too many layers to the GPU can cause OOM errors
-	layers := int(info.FreeMemory/bytesPerLayer) * 3 / 4
 
-	log.Printf("%d MB VRAM available, loading up to %d %s GPU layers out of %d", info.FreeMemory/(1024*1024), layers, info.Library, numLayer)
-
-	return layers
+	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
 }
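
The reworked CheckVRAM now reserves a flat 384 MiB for llama.cpp's own allocations before reporting free memory. A minimal standalone sketch of that calculation; the helper name and the example card size are illustrative, only the 384 MiB figure comes from the diff:

package main

import "fmt"

// usableVRAM mirrors the logic added to CheckVRAM: subtract a fixed 384 MiB
// of llama.cpp overhead (allocations outside the model) before reporting how
// much VRAM is left for weights and the kv cache.
func usableVRAM(freeBytes uint64) int64 {
	const overhead = 384 * 1024 * 1024
	if freeBytes <= overhead {
		return 0
	}
	return int64(freeBytes - overhead)
}

func main() {
	// e.g. a hypothetical card reporting 8 GiB free:
	// 8 GiB - 384 MiB = 8187281408 bytes (~7.6 GiB) usable
	fmt.Println(usableVRAM(8 * 1024 * 1024 * 1024))
}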

+ 17 - 17
gpu/gpu_darwin.go

@@ -6,18 +6,31 @@ import "C"
 import (
 	"runtime"
 
-	"github.com/jmorganca/ollama/api"
+	"github.com/pbnjay/memory"
 )
 
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
 func CheckVRAM() (int64, error) {
-	// TODO - assume metal, and return free memory?
-	return 0, nil
+	if runtime.GOARCH == "amd64" {
+		// gpu not supported, this may not be metal
+		return 0, nil
+	}
+
+	// on macOS, there's already buffer for available vram (see below) so just return the total
+	systemMemory := int64(memory.TotalMemory())
 
+	// macOS limits how much memory is available to the GPU based on the amount of system memory
+	// TODO: handle case where iogpu.wired_limit_mb is set to a higher value
+	if systemMemory <= 36*1024*1024*1024 {
+		systemMemory = systemMemory * 2 / 3
+	} else {
+		systemMemory = systemMemory * 3 / 4
+	}
+
+	return systemMemory, nil
 }
 
 func GetGPUInfo() GpuInfo {
-	// TODO - Metal vs. x86 macs...
 	mem, _ := getCPUMem()
 	return GpuInfo{
 		Library: "default",
@@ -32,19 +45,6 @@ func getCPUMem() (memInfo, error) {
 	}, nil
 }
 
-func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
-	if opts.NumGPU != -1 {
-		return opts.NumGPU
-	}
-
-	// metal only supported on arm64
-	if runtime.GOARCH == "arm64" {
-		return 1
-	}
-
-	return 0
-}
-
 func nativeInit() error {
 	return nil
 }
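
The new Darwin path treats unified memory as the VRAM budget but applies the cap macOS places on GPU-wired memory: roughly two thirds of system RAM up to 36 GiB, three quarters beyond that. A small sketch of that heuristic; the function name and the example machine sizes are illustrative:

package main

import "fmt"

// metalVRAMBudget restates the heuristic from CheckVRAM above for Apple
// silicon, where the GPU shares system memory but macOS limits how much of
// it may be wired for GPU use.
func metalVRAMBudget(systemBytes int64) int64 {
	const threshold = 36 * 1024 * 1024 * 1024
	if systemBytes <= threshold {
		return systemBytes * 2 / 3
	}
	return systemBytes * 3 / 4
}

func main() {
	fmt.Println(metalVRAMBudget(16 * 1024 * 1024 * 1024)) // 16 GiB machine -> ~10.7 GiB for the GPU
	fmt.Println(metalVRAMBudget(64 * 1024 * 1024 * 1024)) // 64 GiB machine -> 48 GiB for the GPU
}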

+ 3 - 10
llm/ext_server_common.go

@@ -35,14 +35,12 @@ import (
 	"encoding/json"
 	"fmt"
 	"log"
-	"os"
 	"strings"
 	"sync"
 	"time"
 	"unsafe"
 
 	"github.com/jmorganca/ollama/api"
-	"github.com/jmorganca/ollama/gpu"
 )
 
 type extServer interface {
@@ -82,25 +80,20 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
 	return fmt.Errorf(C.GoString(resp.msg))
 }
 
-func newExtServer(server extServer, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newExtServer(server extServer, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
 	if !mutex.TryLock() {
 		log.Printf("concurrent llm servers not yet supported, waiting for prior server to complete")
 		mutex.Lock()
 	}
-	fileInfo, err := os.Stat(model)
-	if err != nil {
-		return nil, err
-	}
+
 	var sparams C.ext_server_params_t
 	sparams.model = C.CString(model)
 	defer C.free(unsafe.Pointer(sparams.model))
 
-	numGPU := gpu.NumGPU(numLayers, fileInfo.Size(), opts)
-
 	sparams.embedding = true
 	sparams.n_ctx = C.uint(opts.NumCtx)
 	sparams.n_batch = C.uint(opts.NumBatch)
-	sparams.n_gpu_layers = C.int(numGPU)
+	sparams.n_gpu_layers = C.int(opts.NumGPU)
 	sparams.main_gpu = C.int(opts.MainGPU)
 	sparams.n_parallel = 1 // TODO - wire up concurrency
 

+ 2 - 2
llm/ext_server_default.go

@@ -54,9 +54,9 @@ func (llm *llamaExtServer) llama_server_release_json_resp(json_resp **C.char) {
 	C.llama_server_release_json_resp(json_resp)
 }
 
-func newDefaultExtServer(model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDefaultExtServer(model string, adapters, projectors []string, opts api.Options) (extServer, error) {
 	server := &llamaExtServer{opts}
-	return newExtServer(server, model, adapters, projectors, numLayers, opts)
+	return newExtServer(server, model, adapters, projectors, opts)
 }
 
 func (llm *llamaExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {

+ 5 - 1
llm/ggml.go

@@ -78,7 +78,11 @@ type model interface {
 	ModelFamily() string
 	ModelType() string
 	FileType() string
-	NumLayers() int64
+	NumLayers() uint32
+	NumGQA() uint32
+	NumEmbed() uint32
+	NumHead() uint32
+	NumHeadKv() uint32
 }
 
 type container interface {

+ 38 - 3
llm/gguf.go

@@ -272,14 +272,49 @@ func (llm *ggufModel) Decode(rso *readSeekOffset) error {
 	return nil
 }
 
-func (llm *ggufModel) NumLayers() int64 {
+func (llm *ggufModel) NumLayers() uint32 {
 	value, exists := llm.kv[fmt.Sprintf("%s.block_count", llm.ModelFamily())]
 	if !exists {
 		return 0
 	}
 
-	v := value.(uint32)
-	return int64(v)
+	return value.(uint32)
+}
+
+func (llm *ggufModel) NumHead() uint32 {
+	value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count", llm.ModelFamily())]
+	if !exists {
+		return 0
+	}
+
+	return value.(uint32)
+}
+
+func (llm *ggufModel) NumEmbed() uint32 {
+	value, exists := llm.kv[fmt.Sprintf("%s.embedding_length", llm.ModelFamily())]
+	if !exists {
+		return 0
+	}
+
+	return value.(uint32)
+}
+
+func (llm *ggufModel) NumHeadKv() uint32 {
+	value, exists := llm.kv[fmt.Sprintf("%s.attention.head_count_kv", llm.ModelFamily())]
+	if !exists {
+		return 0
+	}
+
+	return value.(uint32)
+}
+
+func (llm *ggufModel) NumGQA() uint32 {
+	numHeadKv := llm.NumHeadKv()
+	if numHeadKv == 0 {
+		return 0
+	}
+
+	return llm.NumHead() / numHeadKv
 }
 
 func (llm ggufModel) readU8(r io.Reader) uint8 {
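
NumGQA derives the grouped-query-attention factor from two GGUF metadata keys: attention.head_count divided by attention.head_count_kv. A restated sketch with illustrative head counts (the 64/8 split is an example, not read from any particular model):

package main

import "fmt"

// numGQA restates the calculation in ggufModel.NumGQA: how many query heads
// share each key/value head.
func numGQA(headCount, headCountKV uint32) uint32 {
	if headCountKV == 0 {
		return 0 // metadata missing; treated as unknown by the caller
	}
	return headCount / headCountKV
}

func main() {
	fmt.Println(numGQA(32, 32)) // 1: plain multi-head attention
	fmt.Println(numGQA(64, 8))  // 8: grouped-query attention, 8 query heads per kv head
}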

+ 1 - 60
llm/llama.go

@@ -8,7 +8,6 @@ import (
 	"fmt"
 	"os"
 	"os/exec"
-	"sync"
 	"time"
 
 	"github.com/jmorganca/ollama/api"
@@ -43,69 +42,11 @@ number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
 ws ::= ([ \t\n] ws)?
 `
 
-type llamaModel struct {
-	hyperparameters llamaHyperparameters
-}
-
-func (llm *llamaModel) ModelFamily() string {
-	return "llama"
-}
-
-func llamaModelType(numLayer uint32) string {
-	switch numLayer {
-	case 26:
-		return "3B"
-	case 32:
-		return "7B"
-	case 40:
-		return "13B"
-	case 48:
-		return "34B"
-	case 60:
-		return "30B"
-	case 80:
-		return "65B"
-	default:
-		return "unknown"
-	}
-}
-
-func (llm *llamaModel) ModelType() string {
-	return llamaModelType(llm.hyperparameters.NumLayer)
-}
-
-func (llm *llamaModel) FileType() string {
-	return fileType(llm.hyperparameters.FileType)
-}
-
-func (llm *llamaModel) NumLayers() int64 {
-	return int64(llm.hyperparameters.NumLayer)
-}
-
-type llamaHyperparameters struct {
-	// NumVocab is the size of the model's vocabulary.
-	NumVocab uint32
-
-	// NumEmbd is the size of the model's embedding layer.
-	NumEmbd uint32
-	NumMult uint32
-	NumHead uint32
-
-	// NumLayer is the number of layers in the model.
-	NumLayer uint32
-	NumRot   uint32
-
-	// FileType describes the quantization level of the model, e.g. Q4_0, Q5_K, etc.
-	FileType uint32
-}
-
 type Running struct {
 	Port          int
 	Cmd           *exec.Cmd
 	Cancel        context.CancelFunc
-	exitOnce      sync.Once
-	exitCh        chan error // channel to receive the exit status of the subprocess
-	*StatusWriter            // captures error messages from the llama runner process
+	*StatusWriter // captures error messages from the llama runner process
 }
 
 type ImageData struct {

+ 85 - 32
llm/llm.go

@@ -7,10 +7,7 @@ import (
 	"os"
 	"runtime"
 
-	"github.com/pbnjay/memory"
-
 	"github.com/jmorganca/ollama/api"
-	"github.com/jmorganca/ollama/format"
 	"github.com/jmorganca/ollama/gpu"
 )
 
@@ -40,32 +37,89 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		return nil, err
 	}
 
-	if runtime.GOOS == "darwin" {
-		var requiredMemory int64
-		var f16Multiplier int64 = 2
-
-		switch ggml.ModelType() {
-		case "3B", "7B":
-			requiredMemory = 8 * format.GigaByte
-		case "13B":
-			requiredMemory = 16 * format.GigaByte
-		case "30B", "34B", "40B":
-			requiredMemory = 32 * format.GigaByte
-		case "47B":
-			requiredMemory = 48 * format.GigaByte
-		case "65B", "70B":
-			requiredMemory = 64 * format.GigaByte
-		case "180B":
-			requiredMemory = 128 * format.GigaByte
-			f16Multiplier = 4
-		}
+	if opts.NumCtx < 4 {
+		opts.NumCtx = 4
+	}
+
+	fmt.Println("size", ggml.Size)
+	fmt.Println("filetype", ggml.FileType())
+	fmt.Println("architecture", ggml.ModelFamily())
+	fmt.Println("type", ggml.ModelType())
+	fmt.Println("name", ggml.Name())
+	fmt.Println("embd", ggml.NumEmbed())
+	fmt.Println("head", ggml.NumHead())
+	fmt.Println("head_kv", ggml.NumHeadKv())
+	fmt.Println("gqa", ggml.NumGQA())
+
+	available, _ := gpu.CheckVRAM()
+
+	// For now assume filesize = model size
+	// TODO: use actual model size
+	requiredModel := ggml.Size
+
+	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
+	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
 
-		systemMemory := int64(memory.TotalMemory())
+	// this amount is the overhead + tensors in memory
+	// TODO: get this from the llama.cpp's graph calcluations instead of
+	// guessing it's ~1/7th of the kv cache times gqa
+	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 7
 
-		if ggml.FileType() == "F16" && requiredMemory*f16Multiplier > systemMemory {
-			return nil, fmt.Errorf("F16 model requires at least %s of memory", format.HumanBytes(requiredMemory))
-		} else if requiredMemory > systemMemory {
-			return nil, fmt.Errorf("model requires at least %s of memory", format.HumanBytes(requiredMemory))
+	requiredTotal := requiredModel + requiredKv + requiredAlloc
+
+	log.Println("system memory bytes:", available)
+	log.Println("required model bytes:", requiredModel)
+	log.Println("required kv bytes:", requiredKv)
+	log.Println("required alloc bytes:", requiredAlloc)
+	log.Println("required total bytes:", requiredTotal)
+
+	info := gpu.GetGPUInfo()
+	library := info.Library
+
+	if opts.NumGPU == -1 {
+		// default to offloading all layers
+		opts.NumGPU = int(ggml.NumLayers()) + 1
+	}
+
+	// decide how many layers to put on the GPU
+	if opts.NumGPU > 0 {
+		switch runtime.GOOS {
+		case "darwin":
+			if requiredTotal > available {
+				log.Println("not enough vram available, falling back to CPU only")
+				opts.NumGPU = 0
+			}
+		default:
+			if library == "cpu" || library == "default" {
+				opts.NumGPU = 0
+				break
+			}
+
+			// no offloading required
+			if requiredTotal <= available {
+				break
+			}
+
+			// This handles two cases:
+			// 1. overhead + tensors are always loaded into scratch memory even with num_gpu 0
+			// 2. it seems llama.cpp always tries to allocate the entire kv cache (even if later split into layers) into vram or crashes
+			if requiredAlloc > available || requiredKv > available {
+				log.Printf("not enough vram available, falling back to CPU only")
+				library = "cpu"
+				opts.NumGPU = 0
+				break
+			}
+
+			available -= requiredAlloc
+
+			// fill remaining vram with layers
+			log.Println("splitting", available, "of available memory bytes into layers")
+			bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
+			log.Println("bytes per layer:", bytesPerLayer)
+			layers := available / bytesPerLayer
+			if layers < int64(opts.NumGPU) {
+				opts.NumGPU = int(layers)
+			}
 		}
 	}
 
@@ -73,7 +127,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
 	gpuInfo := gpu.GetGPUInfo()
-	return newLlmServer(gpuInfo.Library, model, adapters, projectors, ggml.NumLayers(), opts)
+	return newLlmServer(gpuInfo.Library, model, adapters, projectors, opts)
 }
 
 // Give any native cgo implementations an opportunity to initialize
@@ -81,9 +135,9 @@ func Init(workdir string) error {
 	return nativeInit(workdir)
 }
 
-func newLlmServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newLlmServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
 	if _, libPresent := AvailableShims[library]; libPresent && library != "default" {
-		srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, numLayers, opts)
+		srv, err := newDynamicShimExtServer(AvailableShims[library], model, adapters, projectors, opts)
 		if err == nil {
 			return srv, nil
 		}
@@ -91,6 +145,5 @@ func newLlmServer(library, model string, adapters, projectors []string, numLayer
 		// TODO - update some state to indicate we were unable to load the GPU library for future "info" ux
 	}
 
-	return newDefaultExtServer(model, adapters, projectors, numLayers, opts)
-
+	return newDefaultExtServer(model, adapters, projectors, opts)
 }
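
The heart of the change is the estimate in New above: model bytes plus an fp16 kv cache plus a rough guess at llama.cpp's graph/scratch allocation, compared against the VRAM reported by gpu.CheckVRAM, with the remainder divided into per-layer chunks. A worked sketch using hypothetical figures for a 7B-class model (roughly 3.8 GB of weights, 32 layers, 4096-dim embeddings, 32 heads, 32 kv heads, a 2048-token context); only the formulas come from the diff, the numbers are examples:

package main

import "fmt"

func main() {
	const (
		numCtx    = int64(2048)
		numLayer  = int64(32)
		numEmbed  = int64(4096)
		numHead   = int64(32)
		numHeadKv = int64(32)
		numGQA    = numHead / numHeadKv // 1 for this hypothetical model

		modelBytes = int64(3_800_000_000)          // file size stands in for model size
		available  = int64(4 * 1024 * 1024 * 1024) // free VRAM after the 384 MiB overhead
	)

	// fp16 K and V caches: 2 bytes * 2 tensors * n_ctx * n_layer * n_embd * n_head_kv / n_head
	requiredKv := 2 * 2 * numCtx * numLayer * numEmbed * numHeadKv / numHead // exactly 1 GiB here

	// rough stand-in for llama.cpp's graph/scratch allocation: ~gqa * kv / 7
	requiredAlloc := numGQA * requiredKv / 7

	requiredTotal := modelBytes + requiredKv + requiredAlloc
	fmt.Println("required total bytes:", requiredTotal)

	// the scratch allocation (and, it seems, the whole kv cache) must fit on its own
	if requiredAlloc > available || requiredKv > available {
		fmt.Println("not enough vram, falling back to CPU only")
		return
	}

	// split whatever is left after the scratch allocation into layer-sized chunks
	bytesPerLayer := (modelBytes + requiredKv) / numLayer
	layers := (available - requiredAlloc) / bytesPerLayer
	fmt.Println("layers that fit on the GPU:", layers) // ~27 of 32 with these numbers
}

With these example numbers about 27 of the 32 layers fit in a 4 GiB budget; opts.NumGPU, which now defaults to NumLayers()+1, is then clamped to that count.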

+ 1 - 1
llm/shim_darwin.go

@@ -16,7 +16,7 @@ import (
 //go:embed llama.cpp/ggml-metal.metal
 var libEmbed embed.FS
 
-func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
 	// should never happen...
 	return nil, fmt.Errorf("Dynamic library loading not supported on Mac")
 }

+ 2 - 2
llm/shim_ext_server.go

@@ -72,7 +72,7 @@ func (llm *shimExtServer) llama_server_release_json_resp(json_resp **C.char) {
 	C.dynamic_shim_llama_server_release_json_resp(llm.s, json_resp)
 }
 
-func newDynamicShimExtServer(library, model string, adapters, projectors []string, numLayers int64, opts api.Options) (extServer, error) {
+func newDynamicShimExtServer(library, model string, adapters, projectors []string, opts api.Options) (extServer, error) {
 	shimMutex.Lock()
 	defer shimMutex.Unlock()
 	updatePath(filepath.Dir(library))
@@ -90,7 +90,7 @@ func newDynamicShimExtServer(library, model string, adapters, projectors []strin
 		options: opts,
 	}
 	log.Printf("Loading Dynamic Shim llm server: %s", library)
-	return newExtServer(llm, model, adapters, projectors, numLayers, opts)
+	return newExtServer(llm, model, adapters, projectors, opts)
 }
 
 func (llm *shimExtServer) Predict(ctx context.Context, pred PredictOpts, fn func(PredictResult)) error {