فهرست منبع

Merge pull request #4215 from ollama/mxyng/mem

llm: add minimum based on layer size
Michael Yang 1 سال پیش
والد
کامیت
70edb9bc4d
3 فایل تغییر یافته به همراه 7 افزوده شده و 7 حذف شده
  1. 2 2
      gpu/gpu.go
  2. 1 1
      gpu/gpu_darwin.go
  3. 4 4
      llm/memory.go

+ 2 - 2
gpu/gpu.go

@@ -31,8 +31,8 @@ type handles struct {
 }
 }
 
 
 const (
 const (
-	cudaMinimumMemory = 457 * format.MebiByte
-	rocmMinimumMemory = 457 * format.MebiByte
+	cudaMinimumMemory = 256 * format.MebiByte
+	rocmMinimumMemory = 256 * format.MebiByte
 )
 )
 
 
 var gpuMutex sync.Mutex
 var gpuMutex sync.Mutex

+ 1 - 1
gpu/gpu_darwin.go

@@ -15,7 +15,7 @@ import (
 )
 )
 
 
 const (
 const (
-	metalMinimumMemory = 512 * format.MebiByte
+	metalMinimumMemory = 384 * format.MebiByte
 )
 )
 
 
 func GetGPUInfo() GpuInfoList {
 func GetGPUInfo() GpuInfoList {

+ 4 - 4
llm/memory.go

@@ -85,19 +85,19 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		graphPartialOffload = graphFullOffload
 		graphPartialOffload = graphFullOffload
 	}
 	}
 
 
+	layers := ggml.Tensors().Layers()
+
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
+	memoryRequiredTotal := memoryMinimum + graphFullOffload + layers["blk.0"].size()
 
 
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
 	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload + layers["blk.0"].size()
 
 
 	if memoryRequiredPartial > memoryAvailable {
 	if memoryRequiredPartial > memoryAvailable {
 		slog.Debug("insufficient VRAM to load any model layers")
 		slog.Debug("insufficient VRAM to load any model layers")
 		return 0, 0
 		return 0, 0
 	}
 	}
 
 
-	layers := ggml.Tensors().Layers()
-
 	var memoryLayerOutput uint64
 	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
 		memoryLayerOutput += layer.size()