
Introduce GPU Overhead env var (#5922)

Provide a mechanism for users to set aside an amount of VRAM on each GPU
to make room for other applications they want to start after Ollama, or to work around
memory prediction bugs
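
For example, starting the server with OLLAMA_GPU_OVERHEAD=1073741824 keeps roughly 1 GiB of VRAM per GPU out of the offload calculation; the value is a plain byte count, as documented in the envconfig change below.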
Daniel Hiltgen 7 months ago
parent
commit
b05c9e83d9
3 changed files with 28 additions and 3 deletions
  1. cmd/cmd.go (+1, -0)
  2. envconfig/config.go (+20, -0)
  3. llm/memory.go (+7, -3)

+ 1 - 0
cmd/cmd.go

@@ -1421,6 +1421,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
+				envVars["OLLAMA_GPU_OVERHEAD"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)

+ 20 - 0
envconfig/config.go

@@ -231,6 +231,25 @@ var (
 	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 )
 
+func Uint64(key string, defaultValue uint64) func() uint64 {
+	return func() uint64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+
+		return defaultValue
+	}
+}
+
+var (
+	// Set aside VRAM per GPU
+	GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
+)
+
 type EnvVar struct {
 	Name        string
 	Value       any
@@ -241,6 +260,7 @@ func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
+		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},

+ 7 - 3
llm/memory.go

@@ -7,6 +7,7 @@ import (
 	"strings"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -94,6 +95,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// Overflow that didn't fit into the GPU
 	var overflow uint64
 
+	overhead := envconfig.GpuOverhead()
 	availableList := make([]string, len(gpus))
 	for i, gpu := range gpus {
 		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
@@ -164,7 +166,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+		if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			continue
 		}
@@ -196,7 +198,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSize {
+			if (g.g.FreeMemory - overhead) > used+layerSize {
 				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
@@ -219,7 +221,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+memoryLayerOutput {
+			if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
 				gpuAllocations[g.i] += memoryLayerOutput
 				layerCounts[g.i]++
 				layerCount++
@@ -306,6 +308,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 }
 
 func (m MemoryEstimate) log() {
+	overhead := envconfig.GpuOverhead()
 	slog.Info(
 		"offload to "+m.inferenceLibrary,
 		slog.Group(
@@ -323,6 +326,7 @@ func (m MemoryEstimate) log() {
 			"memory",
 			// memory available by GPU for offloading
 			"available", m.availableList,
+			"gpu_overhead", format.HumanBytes2(overhead),
 			slog.Group(
 				"required",
 				// memory required for full offloading
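
Taken together, the memory.go changes simply shrink each GPU's usable free memory by the configured overhead before the existing fit checks run. A small illustrative sketch of that idea (effectiveFree and layerFits are hypothetical helpers for this example; the saturating subtraction is this sketch's own guard against uint64 wraparound when the overhead exceeds free memory, whereas the commit subtracts directly):

package main

import "fmt"

// effectiveFree treats the configured overhead as memory that is never
// available for offloading. Clamping at zero is this sketch's own addition.
func effectiveFree(freeMemory, overhead uint64) uint64 {
	if overhead >= freeMemory {
		return 0
	}
	return freeMemory - overhead
}

// layerFits mirrors the shape of the checks in EstimateGPULayers: place one
// more layer on a GPU only if the planned allocations plus that layer still
// fit in the effective free memory.
func layerFits(freeMemory, overhead, used, layerSize uint64) bool {
	return effectiveFree(freeMemory, overhead) > used+layerSize
}

func main() {
	const GiB = 1 << 30
	// 24 GiB card with 20 GiB already planned: a 3 GiB layer fits with no
	// overhead, but not once 2 GiB is set aside for other applications.
	fmt.Println(layerFits(24*GiB, 0, 20*GiB, 3*GiB))     // true
	fmt.Println(layerFits(24*GiB, 2*GiB, 20*GiB, 3*GiB)) // false
}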