Browse Source

only count output tensors

Michael Yang 1 year ago
parent
commit
7bb7cb8a60
1 changed file with 18 additions and 9 deletions
  1. 18 9
      llm/memory.go

+ 18 - 9
llm/memory.go

@@ -5,7 +5,6 @@ import (
 	"log/slog"
 	"os"
 	"strconv"
-	"strings"
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
@@ -100,8 +99,22 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		return 0, 0
 	}
 
-	var layerCount int
 	layers := ggml.Tensors().Layers()
+
+	var memoryLayerOutput uint64
+	for k, v := range layers {
+		if k == "output" || k == "output_norm" {
+			memoryLayerOutput += v.size()
+		}
+	}
+
+	if gpus[0].Library == "metal" && opts.UseMMap {
+		// memory is preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
+		memoryRequiredPartial += memoryLayerOutput
+	}
+
+	var layerCount int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
 		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
 
@@ -115,15 +128,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		}
 	}
 
-	var memoryLayerOutput uint64
-	for k, v := range layers {
-		if !strings.HasPrefix(k, "blk.") {
-			memoryLayerOutput += v.size()
-		}
+	if gpus[0].Library != "metal" || !opts.UseMMap {
+		// memory was not preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
 	}
 
-	memoryRequiredTotal += memoryLayerOutput
-
 	if memoryAvailable > memoryRequiredTotal {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal