
update memory calculations

count each layer independently when deciding gpu offloading
Michael Yang 1 year ago
parent
commit
91b3e4d282
7 changed files with 121 additions and 85 deletions
  1. format/bytes.go (+16, -1)
  2. gpu/gpu.go (+11, -14)
  3. gpu/types.go (+3, -0)
  4. llm/dyn_ext_server.go (+2, -2)
  5. llm/ggml.go (+11, -0)
  6. llm/llm.go (+73, -63)
  7. server/routes.go (+5, -5)

+ 16 - 1
format/bytes.go

@@ -6,11 +6,15 @@ import (
 )
 
 const (
-	Byte     = 1
+	Byte = 1
+
 	KiloByte = Byte * 1000
 	MegaByte = KiloByte * 1000
 	GigaByte = MegaByte * 1000
 	TeraByte = GigaByte * 1000
+
+	KibiByte = Byte * 1024
+	MebiByte = KibiByte * 1024
 )
 
 func HumanBytes(b int64) string {
@@ -45,3 +49,14 @@ func HumanBytes(b int64) string {
 		return fmt.Sprintf("%d %s", int(value), unit)
 	}
 }
+
+func HumanBytes2(b int64) string {
+	switch {
+	case b >= MebiByte:
+		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
+	case b >= KibiByte:
+		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
+	default:
+		return fmt.Sprintf("%d B", b)
+	}
+}
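
A quick usage sketch of the new binary-unit helper (names from the diff above; values illustrative):

	fmt.Println(format.HumanBytes2(377 * format.MebiByte))  // "377.0 MiB"
	fmt.Println(format.HumanBytes2(1536 * format.KibiByte)) // "1.5 MiB"
	fmt.Println(format.HumanBytes2(512))                    // "512 B"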

+ 11 - 14
gpu/gpu.go

@@ -20,6 +20,8 @@ import (
 	"strings"
 	"sync"
 	"unsafe"
+
+	"github.com/ollama/ollama/format"
 )
 
 type handles struct {
@@ -27,6 +29,11 @@ type handles struct {
 	cudart *C.cudart_handle_t
 }
 
+const (
+	cudaMinimumMemory = 377 * format.MebiByte
+	rocmMinimumMemory = 377 * format.MebiByte
+)
+
 var gpuMutex sync.Mutex
 var gpuHandles *handles = nil
 
@@ -168,6 +175,7 @@ func GetGPUInfo() GpuInfo {
 			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 				slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
+				resp.MinimumMemory = cudaMinimumMemory
 			} else {
 				slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
@@ -187,6 +195,7 @@ func GetGPUInfo() GpuInfo {
 			} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 				slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 				resp.Library = "cuda"
+				resp.MinimumMemory = cudaMinimumMemory
 			} else {
 				slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
@@ -194,6 +203,7 @@ func GetGPUInfo() GpuInfo {
 	} else {
 		AMDGetGPUInfo(&resp)
 		if resp.Library != "" {
+			resp.MinimumMemory = rocmMinimumMemory
 			return resp
 		}
 	}
@@ -239,20 +249,7 @@ func CheckVRAM() (int64, error) {
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
-		overhead := gpuInfo.FreeMemory / 10
-		gpus := uint64(gpuInfo.DeviceCount)
-		if overhead < gpus*1024*1024*1024 {
-			overhead = gpus * 1024 * 1024 * 1024
-		}
-		// Assigning full reported free memory for Tegras due to OS controlled caching.
-		if CudaTegra != "" {
-			// Setting overhead for non-Tegra devices
-			overhead = 0
-		}
-		avail := int64(gpuInfo.FreeMemory - overhead)
-		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
-		return avail, nil
+		return int64(gpuInfo.FreeMemory), nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determination
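
The net effect: CheckVRAM now reports raw free memory, and the fixed 377 MiB reservation surfaces through GpuInfo.MinimumMemory instead of the old ad-hoc 10% / 1 GiB overhead. A minimal sketch of the new accounting, mirroring llm.New below:

	availableMemory, _ := gpu.CheckVRAM() // raw free VRAM, no overhead subtracted
	info := gpu.GetGPUInfo()
	usedMemory := info.MinimumMemory // 377 MiB reserved up front on cuda/rocm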

+ 3 - 0
gpu/types.go

@@ -14,6 +14,9 @@ type GpuInfo struct {
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant,omitempty"`
 
+	// MinimumMemory represents the minimum memory required to use the GPU
+	MinimumMemory int64 `json:"-"`
+
 	// TODO add other useful attributes about the card here for discovery information
 }
 

+ 2 - 2
llm/dyn_ext_server.go

@@ -39,7 +39,7 @@ import (
 
 type dynExtServer struct {
 	s       C.struct_dynamic_llama_server
-	options api.Options
+	options *api.Options
 }
 
 // Note: current implementation does not support concurrent instantiations
@@ -64,7 +64,7 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
 	return fmt.Errorf(C.GoString(resp.msg))
 }
 
-func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	if !mutex.TryLock() {
 		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
 		mutex.Lock()
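
Switching to *api.Options lets llm.New mutate fields such as NumCtx and NumGPU and have the caller observe the results. A minimal sketch, assuming the New signature from this diff and api.DefaultOptions; modelPath is hypothetical:

	opts := api.DefaultOptions()
	server, err := llm.New(modelPath, nil, nil, &opts)
	// on success, opts.NumGPU now holds the number of offloaded layers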

+ 11 - 0
llm/ggml.go

@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"strings"
 )
 
 type GGML struct {
@@ -12,6 +13,16 @@ type GGML struct {
 	model
 }
 
+func (ggml *GGML) LayerSize(prefix string) (n int64) {
+	for _, t := range ggml.Tensors() {
+		if strings.HasPrefix(t.Name, prefix) {
+			n += int64(t.size())
+		}
+	}
+
+	return
+}
+
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
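
LayerSize totals every tensor whose name shares a prefix; the offload logic in llm.go uses it to price one repeating layer ("blk.N.") or the output layer ("output."). A hedged usage sketch, assuming a decoded *GGML and conventional GGUF tensor names:

	repeating := ggml.LayerSize("blk.0.") // blk.0.attn_q.weight, blk.0.ffn_up.weight, ...
	output := ggml.LayerSize("output.")
	fmt.Println(format.HumanBytes2(repeating), format.HumanBytes2(output))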

+ 73 - 63
llm/llm.go

@@ -5,10 +5,11 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
-	"runtime"
 	"slices"
+	"strings"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
 
@@ -24,7 +25,7 @@ var cpuOnlyFamilies = []string{
 	"mamba",
 }
 
-func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
@@ -35,7 +36,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}
 	defer f.Close()
 
-	ggml, size, err := DecodeGGML(f)
+	ggml, _, err := DecodeGGML(f)
 	if err != nil {
 		return nil, err
 	}
@@ -49,84 +50,93 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 		opts.NumCtx = 4
 	}
 
-	vram, _ := gpu.CheckVRAM()
+	availableMemory, _ := gpu.CheckVRAM()
+	info := gpu.GetGPUInfo()
+
+	usedMemory := info.MinimumMemory
+	for _, projector := range projectors {
+		usedMemory += projectorMemoryRequirements(projector)
+
+		// multimodal models require at least 2048 context
+		opts.NumCtx = max(opts.NumCtx, 2048)
+	}
 
-	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
+	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
 	graph := int64(ggml.KV().GQA()) * kv / 6
+	usedMemory += graph
 
-	if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
-		opts.NumGPU = 0
+	if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+		info.Library = "cpu"
 	}
 
-	info := gpu.GetGPUInfo()
-	switch runtime.GOOS {
-	case "darwin":
-		if opts.NumGPU == 0 {
-			break
-		}
+	requiredMemory := usedMemory
 
-		if size+kv+graph > vram {
-			slog.Info("not enough vram available, setting num_gpu=0")
-			opts.NumGPU = 0
-			break
-		}
+	var layers int
+	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+		requiredMemory += layerMemory
 
-		// TODO: implement layer splitting on macOS
-		opts.NumGPU = 999
-	default:
-		if info.Library == "cpu" {
-			slog.Info("GPU not available, falling back to CPU")
-			opts.NumGPU = 0
-			break
+		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
+			usedMemory += layerMemory
+			layers++
 		}
+	}
 
-		// don't use GPU at all if no layers are loaded
-		if opts.NumGPU == 0 {
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
-			break
-		}
+	memOutputLayer := ggml.LayerSize("output.")
+	requiredMemory += memOutputLayer
 
-		// user-defined GPU count
-		if opts.NumGPU != -1 {
-			break
-		}
+	// only offload output layer if all repeating layers are offloaded
+	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
+		usedMemory += memOutputLayer
+		layers++
+	}
 
-		// the "main" GPU needs the most memory and determines the limit
-		// of how many layers can be loaded. It needs to fit:
-		// 1. the full compute graph allocation for all devices (graph)
-		// 2. the proportional kv cache for all devices (kv * % layers)
-		// 3. the proportional model (size * % layers / # devices)
-		// This estimates the number of layers
-		maxlayers := int64(ggml.KV().BlockCount()) + 1
-		devices := int64(info.DeviceCount)
-		avg := vram / devices
-		layers := maxlayers * (avg - graph) / (kv + size/devices)
-		if layers > maxlayers {
-			layers = maxlayers
-		}
+	slog.Info(
+		"offload to gpu",
+		"layers", layers,
+		"required", format.HumanBytes2(requiredMemory),
+		"used", format.HumanBytes2(usedMemory),
+		"available", format.HumanBytes2(availableMemory),
+		"kv", format.HumanBytes2(kv),
+		"graph", format.HumanBytes2(graph),
+	)
+
+	if opts.NumGPU < 0 && info.Library != "cpu" {
+		opts.NumGPU = layers
+	}
 
-		// 1 + 2 must fit on the main gpu
-		min := graph + kv*layers/maxlayers
-		if layers <= 0 || min > avg {
-			slog.Info("not enough vram available, falling back to CPU only")
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
-			opts.NumGPU = 0
-			break
-		}
+	return newLlmServer(info, model, adapters, projectors, opts)
+}
 
-		opts.NumGPU = int(layers)
+func projectorMemoryRequirements(filename string) int64 {
+	file, err := os.Open(filename)
+	if err != nil {
+		return 0
 	}
+	defer file.Close()
 
-	opts.RopeFrequencyBase = 0.0
-	opts.RopeFrequencyScale = 0.0
-	return newLlmServer(info, model, adapters, projectors, opts)
+	ggml, _, err := DecodeGGML(file)
+	if err != nil {
+		return 0
+	}
+
+	prefixes := make(map[string]struct{})
+	for _, layer := range ggml.Tensors() {
+		parts := strings.Split(layer.Name, ".")
+		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+	}
+
+	var ask int64
+	for prefix := range prefixes {
+		ask += ggml.LayerSize(prefix)
+	}
+
+	return ask
 }
 
 // Give any native cgo implementations an opportunity to initialize
@@ -134,7 +144,7 @@ func Init() error {
 	return nativeInit()
 }
 
-func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	dynLibs := getDynLibs(gpuInfo)
 
 	// Check to see if the user has requested a specific library instead of auto-detecting
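
A worked example of the kv-cache and graph estimates above, assuming hypothetical llama-7b-like metadata (n_ctx=2048, n_layer=32, n_embd=4096, n_head=32, n_head_kv=32, hence GQA=1):

	// kv = 2 (k and v) * 2 (fp16 bytes) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := int64(2 * 2 * 2048 * 32 * 4096 / 32 * 32) // 1 GiB
	graph := int64(1) * kv / 6                      // GQA * kv / 6, roughly 170.7 MiB

The offload loop then walks blk.0. through blk.31., charging each layer its tensor size plus a per-layer share of the kv cache (kv / n_layer), and offloads a layer only while usedMemory plus that layer still fits in availableMemory, which is what "count each layer independently" refers to in the commit message.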

+ 5 - 5
server/routes.go

@@ -68,7 +68,7 @@ var loaded struct {
 var defaultSessionDuration = 5 * time.Minute
 
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
-func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
+func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
 	needLoad := loaded.runner == nil || // is there a model loaded?
 		loaded.ModelPath != model.ModelPath || // has the base model changed?
 		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?
@@ -97,7 +97,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
 
 		loaded.Model = model
 		loaded.runner = llmRunner
-		loaded.Options = &opts
+		loaded.Options = opts
 	}
 
 	loaded.expireAt = time.Now().Add(sessionDuration)
@@ -214,7 +214,7 @@ func GenerateHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -460,7 +460,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
@@ -1267,7 +1267,7 @@ func ChatHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}