فهرست منبع

Introduce GPU Overhead env var (#5922)

Provide a mechanism for users to set aside an amount of VRAM on each GPU
to make room for other applications they want to start after Ollama, or to work around
memory prediction bugs
Daniel Hiltgen 7 ماه پیش
والد
کامیت
b05c9e83d9
3 فایل تغییر یافته به همراه 28 افزوده شده و 3 حذف شده
  1. 1 0
      cmd/cmd.go
  2. 20 0
      envconfig/config.go
  3. 7 3
      llm/memory.go

+ 1 - 0
cmd/cmd.go

@@ -1421,6 +1421,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
 				envVars["OLLAMA_LLM_LIBRARY"],
+				envVars["OLLAMA_GPU_OVERHEAD"],
 			})
 			})
 		default:
 		default:
 			appendEnvDocs(cmd, envs)
 			appendEnvDocs(cmd, envs)

+ 20 - 0
envconfig/config.go

@@ -231,6 +231,25 @@ var (
 	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 )
 )
 
 
+func Uint64(key string, defaultValue uint64) func() uint64 {
+	return func() uint64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+
+		return defaultValue
+	}
+}
+
+var (
+	// Set aside VRAM per GPU
+	GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
+)
+
 type EnvVar struct {
 type EnvVar struct {
 	Name        string
 	Name        string
 	Value       any
 	Value       any
@@ -241,6 +260,7 @@ func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
+		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},

+ 7 - 3
llm/memory.go

@@ -7,6 +7,7 @@ import (
 	"strings"
 	"strings"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/gpu"
 )
 )
@@ -94,6 +95,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// Overflow that didn't fit into the GPU
 	// Overflow that didn't fit into the GPU
 	var overflow uint64
 	var overflow uint64
 
 
+	overhead := envconfig.GpuOverhead()
 	availableList := make([]string, len(gpus))
 	availableList := make([]string, len(gpus))
 	for i, gpu := range gpus {
 	for i, gpu := range gpus {
 		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
 		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
@@ -164,7 +166,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 			gzo = gpuZeroOverhead
 		}
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+		if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			continue
 			continue
 		}
 		}
@@ -196,7 +198,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSize {
+			if (g.g.FreeMemory - overhead) > used+layerSize {
 				gpuAllocations[g.i] += layerSize
 				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCounts[g.i]++
 				layerCount++
 				layerCount++
@@ -219,7 +221,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+memoryLayerOutput {
+			if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
 				gpuAllocations[g.i] += memoryLayerOutput
 				gpuAllocations[g.i] += memoryLayerOutput
 				layerCounts[g.i]++
 				layerCounts[g.i]++
 				layerCount++
 				layerCount++
@@ -306,6 +308,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 }
 }
 
 
 func (m MemoryEstimate) log() {
 func (m MemoryEstimate) log() {
+	overhead := envconfig.GpuOverhead()
 	slog.Info(
 	slog.Info(
 		"offload to "+m.inferenceLibrary,
 		"offload to "+m.inferenceLibrary,
 		slog.Group(
 		slog.Group(
@@ -323,6 +326,7 @@ func (m MemoryEstimate) log() {
 			"memory",
 			"memory",
 			// memory available by GPU for offloading
 			// memory available by GPU for offloading
 			"available", m.availableList,
 			"available", m.availableList,
+			"gpu_overhead", format.HumanBytes2(overhead),
 			slog.Group(
 			slog.Group(
 				"required",
 				"required",
 				// memory required for full offloading
 				// memory required for full offloading