
Improve multi-gpu handling at the limit

Still not complete; the prediction needs further refinement to understand
each discrete GPU's available space so we can determine how many layers fit
on each one. Since we can't split a single layer across multiple GPUs, we
can't treat the combined free space as one logical block.
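
The core idea of this change, per the description above, is to stop pooling free VRAM and instead place whole layers onto individual GPUs that still have room. Below is a minimal, self-contained Go sketch of that placement idea; fitLayers and the sample sizes are illustrative only, not part of this commit, and the real logic in llm/memory.go additionally reserves graph, projector, output-layer and per-GPU minimum overheads and rotates across the remaining GPUs.

package main

import "fmt"

// fitLayers distributes whole layers across per-GPU free space.
// A layer can never be split across GPUs, so each layer must fit
// entirely within a single GPU's remaining memory. Simplified
// first-fit sketch; overheads are ignored here.
func fitLayers(freeMemory, layerSizes []uint64) (perGPU []int, offloaded int) {
	remaining := append([]uint64(nil), freeMemory...)
	perGPU = make([]int, len(freeMemory))
	for _, size := range layerSizes {
		placed := false
		for i := range remaining {
			if remaining[i] >= size {
				remaining[i] -= size
				perGPU[i]++
				offloaded++
				placed = true
				break
			}
		}
		if !placed {
			break // the rest of the layers overflow to the CPU
		}
	}
	return perGPU, offloaded
}

func main() {
	// Two GPUs with 5 GiB free each and three 3 GiB layers: a single
	// 10 GiB pool would suggest all three fit (9 GiB), but since no
	// GPU has 3 GiB left after taking one layer, only two are offloaded.
	free := []uint64{5 << 30, 5 << 30}
	layers := []uint64{3 << 30, 3 << 30, 3 << 30}
	perGPU, offloaded := fitLayers(free, layers)
	fmt.Println("layers per GPU:", perGPU, "offloaded:", offloaded) // [1 1] 2
}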
Daniel Hiltgen, 11 months ago
Parent
Commit
6fd04ca922
11 changed files with 387 additions and 87 deletions
  1. gpu/amd_linux.go (+1 -1)
  2. gpu/gpu.go (+0 -4)
  3. gpu/types.go (+3 -3)
  4. integration/concurrency_test.go (+19 -1)
  5. integration/context_test.go (+1 -1)
  6. integration/utils_test.go (+1 -1)
  7. llm/ggml.go (+1 -0)
  8. llm/memory.go (+213 -48)
  9. llm/memory_test.go (+116 -0)
  10. llm/server.go (+31 -28)
  11. server/sched_test.go (+1 -0)

+ 1 - 1
gpu/amd_linux.go

@@ -27,7 +27,7 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 
 	// Direct Rendering Manager sysfs location
-	DRMDeviceDirGlob   = "/sys/class/drm/card[0-9]/device"
+	DRMDeviceDirGlob   = "/sys/class/drm/card*/device"
 	DRMTotalMemoryFile = "mem_info_vram_total"
 	DRMUsedMemoryFile  = "mem_info_vram_used"
 

+ 0 - 4
gpu/gpu.go

@@ -246,10 +246,6 @@ func GetGPUInfo() GpuInfoList {
 			return GpuInfoList{cpus[0].GpuInfo}
 		}
 
-		// TODO - implement
-
-		// TODO refine the discovery to only gather total memory
-
 		// On windows we bundle the nvidia library one level above the runner dir
 		depPath := ""
 		if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {

+ 3 - 3
gpu/types.go

@@ -44,14 +44,14 @@ type CPUInfo struct {
 
 type CudaGPUInfo struct {
 	GpuInfo
-	index int // device index
+	index int // nolint: unused
 }
 type CudaGPUInfoList []CudaGPUInfo
 
 type RocmGPUInfo struct {
 	GpuInfo
-	usedFilepath string // linux
-	index        int    // device index on windows
+	usedFilepath string // nolint: unused
+	index        int    // nolint: unused
 }
 type RocmGPUInfoList []RocmGPUInfo
 

+ 19 - 1
integration/concurrency_test.go

@@ -38,7 +38,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 		}
 		resp = [2][]string{
 			[]string{"sunlight"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 		}
 	)
 	var wg sync.WaitGroup
@@ -229,5 +229,23 @@ func TestMultiModelStress(t *testing.T) {
 			}
 		}(i)
 	}
+	go func() {
+		for {
+			time.Sleep(2 * time.Second)
+			select {
+			case <-ctx.Done():
+				return
+			default:
+				models, err := client.ListRunning(ctx)
+				if err != nil {
+					slog.Warn("failed to list running models", "error", err)
+					continue
+				}
+				for _, m := range models.Models {
+					slog.Info("loaded model snapshot", "model", m)
+				}
+			}
+		}
+	}()
 	wg.Wait()
 }

+ 1 - 1
integration/context_test.go

@@ -11,7 +11,7 @@ import (
 )
 
 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
+	ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute) // Longer needed for small footprint GPUs
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{

+ 1 - 1
integration/utils_test.go

@@ -331,7 +331,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		[][]string{
 			[]string{"sunlight"},
 			[]string{"soil", "organic", "earth", "black", "tan"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 			[]string{"fourth", "july", "declaration", "independence"},
 			[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
 		}

+ 1 - 0
llm/ggml.go

@@ -307,6 +307,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 
 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
+			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
 			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)

+ 213 - 48
llm/memory.go

@@ -3,9 +3,10 @@ package llm
 import (
 	"fmt"
 	"log/slog"
+	"strconv"
+	"strings"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -30,24 +32,68 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }
 
+type MemoryEstimate struct {
+	// How many layers we predict we can load
+	Layers int
+
+	// The size of the graph which occupies the main GPU
+	Graph uint64
+
+	// How much VRAM will be allocated given the number of layers we predict
+	VRAMSize uint64
+
+	// The total size of the model if loaded into VRAM.  If all layers are loaded, VRAMSize == TotalSize
+	TotalSize uint64
+
+	// For multi-GPU scenarios, this provides the tensor split parameter
+	TensorSplit string
+
+	// For multi-GPU scenarios, this is the size in bytes per GPU
+	GPUSizes []uint64
+}
+
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
-	var memoryAvailable uint64
-	for _, info := range gpus {
-		memoryAvailable += info.FreeMemory
-	}
-	if envconfig.MaxVRAM > 0 {
-		memoryAvailable = envconfig.MaxVRAM
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+	// Graph size for a partial offload, applies to all GPUs
+	var graphPartialOffload uint64
+
+	// Graph size when all layers are offloaded, applies to all GPUs
+	var graphFullOffload uint64
+
+	// Final graph offload once we know full or partial
+	var graphOffload uint64
+
+	// Projectors loaded into GPU0 only
+	var projectorSize uint64
+
+	// Conditional output size on GPU 0
+	var memoryLayerOutput uint64
+	var includeOutput bool
+
+	// One extra layer as a pad for each GPU
+	var layerBuffer uint64
 
-	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
+	// The sizes of the main layers
+	var layerSizes []uint64
 
-	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
-	memoryMinimum := gpus[0].MinimumMemory
+	// The sum of all the layer sizes (just for logging)
+	var memoryWeights uint64
+
+	// True if all the layers are loaded
+	var fullyLoaded bool
+
+	// Overflow that didn't fit into the GPU
+	var overflow uint64
+
+	availableList := make([]string, len(gpus))
+	for i, gpu := range gpus {
+		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
+	}
+	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
 
 	for _, projector := range projectors {
-		memoryMinimum += projectorMemoryRequirements(projector)
+		projectorSize += projectorMemoryRequirements(projector)
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -56,40 +102,28 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		memoryMinimum += blk0.size()
+		layerBuffer = blk0.size()
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
-
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
 	}
 
-	graphFullOffload *= uint64(len(gpus))
-	graphPartialOffload *= uint64(len(gpus))
-
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
 	}
 
-	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
-
-	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
-
-	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
 	}
-
 	if layer, ok := layers["output"]; ok {
 		memoryLayerOutput += layer.size()
 	} else if layer, ok := layers["token_embd"]; ok {
@@ -97,38 +131,144 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	}
 
 	if gpus[0].Library == "metal" && opts.UseMMap {
-		// memory is preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
-		memoryRequiredPartial += memoryLayerOutput
+		includeOutput = true
+	} else if gpus[0].Library != "metal" || !opts.UseMMap {
+		includeOutput = true
 	}
 
+	gpuZeroOverhead := projectorSize
+	if includeOutput {
+		gpuZeroOverhead += memoryLayerOutput
+	}
+
+	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
+	layerCounts := make([]int, len(gpus))
+	gpuAllocations := make([]uint64, len(gpus))
+	type gs struct {
+		i int
+		g *gpu.GpuInfo
+	}
+	gpusWithSpace := []gs{}
+	for i := range gpus {
+		var gzo uint64
+		if len(gpusWithSpace) == 0 {
+			gzo = gpuZeroOverhead
+		}
+		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
+			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
+			continue
+		}
+		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
+		gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full
+	}
+
+	var gpuZeroID int
+	if len(gpusWithSpace) > 0 {
+		gpuZeroID = gpusWithSpace[0].i
+		gpuAllocations[gpuZeroID] += gpuZeroOverhead
+	}
+
+	layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
 	for i := range int(ggml.KV().BlockCount()) {
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
 			memoryLayer := blk.size()
 
 			// KV is proportional to the number of layers
 			memoryLayer += kv / ggml.KV().BlockCount()
+			layerSizes[i] = memoryLayer
+			memoryWeights += memoryLayer
+		}
+	}
 
-			memoryRequiredTotal += memoryLayer
-			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
-				memoryRequiredPartial += memoryLayer
+	// For all the layers, find where they can fit on the GPU(s)
+	for i := range layerSizes {
+		if layerSizes[i] == 0 {
+			continue
+		}
+		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
+			// Stop allocating on GPU(s) once we hit the users target NumGPU
+			continue
+		}
+
+		// distribute the layers across the GPU(s) that have space
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[i%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+layerSizes[i] {
+				gpuAllocations[g.i] += layerSizes[i]
+				layerCounts[g.i]++
 				layerCount++
+				break
+			} else {
+				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
+
+	}
+	if layerCount >= int(ggml.KV().BlockCount()) {
+		fullyLoaded = true
+	} else {
+		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+			overflow += layerSizes[i]
+		}
+	}
+	// Find where the output fits
+	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[layerCount%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+memoryLayerOutput {
+				gpuAllocations[g.i] += memoryLayerOutput
+				layerCounts[g.i]++
+				layerCount++
+				break
+			}
+		}
+		if layerCount < int(ggml.KV().BlockCount())+1 {
+			fullyLoaded = false
+			overflow += memoryLayerOutput
+		}
 	}
 
-	if gpus[0].Library != "metal" || !opts.UseMMap {
-		// memory was not preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
+	// Add the applicable (full or partial) graph allocations
+	for i := range gpus {
+		if layerCounts[i] <= 0 {
+			continue
+		}
+		if fullyLoaded {
+			gpuAllocations[i] += graphFullOffload
+		} else {
+			gpuAllocations[i] += graphPartialOffload
+		}
+	}
+	if fullyLoaded {
+		graphOffload = graphFullOffload
+	} else {
+		graphOffload = graphPartialOffload
 	}
 
-	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
-		layerCount = int(ggml.KV().BlockCount()) + 1
-		memoryRequiredPartial = memoryRequiredTotal
+	// Summaries for the log
+	var memoryRequiredPartial, memoryRequiredTotal uint64
+	for i := range gpuAllocations {
+		memoryRequiredPartial += gpuAllocations[i]
+
 	}
+	memoryRequiredTotal = memoryRequiredPartial + overflow
 
-	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
+	tensorSplit := ""
+	if len(gpus) > 1 {
+		splits := make([]string, len(gpus))
+		for i, count := range layerCounts {
+			splits[i] = strconv.Itoa(count)
+		}
+		tensorSplit = strings.Join(splits, ",")
+	}
+	allocationsList := []string{}
+	for _, a := range gpuAllocations {
+		allocationsList = append(allocationsList, format.HumanBytes2(a))
+	}
 
 	slog.Info(
 		"offload to gpu",
@@ -136,13 +276,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			"layers",
 			// requested number of layers to offload
 			"requested", opts.NumGPU,
+			// The number of layers the model has (including output)
+			"model", int(ggml.KV().BlockCount())+1,
 			// estimated number of layers that can be offloaded
-			"real", layerCount,
+			"offload", layerCount,
+			// multi-gpu split for tensors
+			"split", tensorSplit,
 		),
 		slog.Group(
 			"memory",
-			// memory available for offloading
-			"available", format.HumanBytes2(memoryAvailable),
+			// memory available by GPU for offloading
+			"available", availableList,
 			slog.Group(
 				"required",
 				// memory required for full offloading
@@ -151,6 +295,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				"partial", format.HumanBytes2(memoryRequiredPartial),
 				// memory of KV cache
 				"kv", format.HumanBytes2(kv),
+				// Allocations across the GPUs
+				"allocations", allocationsList,
 			),
 			slog.Group(
 				"weights",
@@ -171,12 +317,31 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		),
 	)
 	if gpus[0].Library == "cpu" {
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers:    0,
+			Graph:     0,
+			VRAMSize:  0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes:  []uint64{},
+		}
 	}
-	if memoryRequiredPartial > memoryAvailable {
+	if layerCount == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers:    0,
+			Graph:     0,
+			VRAMSize:  0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes:  []uint64{},
+		}
 	}
 
-	return layerCount, memoryRequiredPartial, memoryRequiredTotal
+	return MemoryEstimate{
+		Layers:      layerCount,
+		Graph:       graphOffload,
+		VRAMSize:    memoryRequiredPartial,
+		TotalSize:   memoryRequiredTotal,
+		TensorSplit: tensorSplit,
+		GPUSizes:    gpuAllocations,
+	}
 }
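
For orientation, a rough sketch of how a caller inside the repo might consume the new MemoryEstimate value; the model path is a placeholder, and the authoritative wiring is the llm/server.go change further below, which sets opts.NumGPU from estimate.Layers and passes TensorSplit via --tensor-split.

package main

import (
	"fmt"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
)

func main() {
	// Hypothetical model path for illustration only.
	ggml, err := llm.LoadModel("/tmp/model.gguf")
	if err != nil {
		panic(err)
	}
	opts := api.DefaultOptions()

	// EstimateGPULayers expects GPUs of a single library, so group them first.
	for _, gpus := range gpu.GetGPUInfo().ByLibrary() {
		estimate := llm.EstimateGPULayers(gpus, ggml, nil, opts)
		fmt.Println("library:", gpus[0].Library,
			"layers:", estimate.Layers,
			"vram:", format.HumanBytes2(estimate.VRAMSize),
			"total:", format.HumanBytes2(estimate.TotalSize),
			"split:", estimate.TensorSplit)
	}
}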

+ 116 - 0
llm/memory_test.go

@@ -0,0 +1,116 @@
+package llm
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/gpu"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestEstimateGPULayers(t *testing.T) {
+	envconfig.Debug = true
+	modelName := "dummy"
+	f, err := os.CreateTemp(t.TempDir(), modelName)
+	assert.Nil(t, err)
+	defer f.Close()
+	gguf := NewGGUFV3(binary.LittleEndian)
+	inputLayerCount := 5
+	tensors := []Tensor{
+		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+	}
+	assert.Equal(t, inputLayerCount+1, len(tensors))
+	err = gguf.Encode(f, KV{
+		"general.architecture":          "llama",
+		"general.name":                  "name",
+		"llama.context_length":          uint32(32),
+		"llama.embedding_length":        uint32(4096),
+		"llama.block_count":             uint32(inputLayerCount),
+		"llama.attention.head_count":    uint32(32),
+		"llama.attention.head_count_kv": uint32(32),
+		"tokenizer.ggml.tokens":         []string{" "},
+		"tokenizer.ggml.scores":         []float32{0},
+		"tokenizer.ggml.token_type":     []int32{0},
+	}, tensors)
+	require.NoError(t, err)
+
+	ggml, err := LoadModel(f.Name())
+	require.NoError(t, err)
+
+	// Simple CPU scenario
+	gpus := []gpu.GpuInfo{
+		{
+			Library: "cpu",
+		},
+	}
+	projectors := []string{}
+	opts := api.DefaultOptions()
+	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+	assert.Equal(t, 0, estimate.Layers)
+	assert.Equal(t, uint64(0), estimate.Graph)
+
+	// derived from the dummy ggml file above
+	graphPartialOffload := uint64(202377216)
+	graphFullOffload := uint64(171968512)
+	layerSize := uint64(33554436)
+	projectorSize := uint64(0)
+	memoryLayerOutput := uint64(4)
+
+	// Dual CUDA scenario with asymmetry
+	gpuMinimumMemory := uint64(2048)
+	gpus = []gpu.GpuInfo{
+		{
+			Library:       "cuda",
+			MinimumMemory: gpuMinimumMemory,
+		},
+		{
+			Library:       "cuda",
+			MinimumMemory: gpuMinimumMemory,
+		},
+	}
+	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
+	for i, s := range [][]uint64{
+		{1, 1, 1, 1},
+		{2, 1, 2, 1},
+		{2, 2, 2, 2},
+		{1, 2, 1, 2},
+		{3, 3, 3, 3},
+		{4, 4, 3, 3},
+		{6, 6, 3, 3},
+		{0, 3, 0, 3},
+	} {
+		gpus[0].FreeMemory = 0
+		gpus[1].FreeMemory = 0
+		gpus[0].FreeMemory += projectorSize + memoryLayerOutput
+		gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
+		gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
+		gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
+		gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+		assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
+		assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
+		var layerSums uint64
+		for _, b := range estimate.GPUSizes {
+			layerSums += b
+		}
+		if estimate.Layers < inputLayerCount+1 {
+			assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+			assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+		} else {
+			assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+			assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+		}
+	}
+
+}

+ 31 - 28
llm/server.go

@@ -49,13 +49,11 @@ type llmServer struct {
 	status  *StatusWriter
 	options api.Options
 
-	// TODO - this should be broken down by GPU
-	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
-	estimatedTotal uint64 // Total size of model
-	totalLayers    uint64
-	gpuCount       int
-	loadDuration   time.Duration // Record how long it took the model to load
-	loadProgress   float32
+	estimate     MemoryEstimate
+	totalLayers  uint64
+	gpuCount     int
+	loadDuration time.Duration // Record how long it took the model to load
+	loadProgress float32
 
 	sem *semaphore.Weighted
 }
@@ -80,8 +78,7 @@ func LoadModel(model string) (*GGML, error) {
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
-	var estimatedVRAM uint64
-	var estimatedTotal uint64
+	var estimate MemoryEstimate
 	var systemMemory uint64
 	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
@@ -89,7 +86,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 		cpuRunner = serverForCpu()
 		gpuCount = 0
-		_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,20 +97,19 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 				slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
 			}
 		}
-		var layers int
-		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 
 		switch {
-		case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
+		case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
-		case gpus[0].Library != "metal" && layers == 0:
+		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = serverForCpu()
 			gpuCount = 0
-		case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
-			opts.NumGPU = layers
+		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
+			opts.NumGPU = estimate.Layers
 		}
 	}
 
@@ -232,6 +228,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
+	if estimate.TensorSplit != "" {
+		params = append(params, "--tensor-split", estimate.TensorSplit)
+	}
+
+	if estimate.TensorSplit != "" {
+		params = append(params, "--tensor-split", estimate.TensorSplit)
+	}
+
 	for i := range len(servers) {
 		dir := availableServers[servers[i]]
 		if dir == "" {
@@ -299,16 +303,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		s := &llmServer{
-			port:           port,
-			cmd:            exec.Command(server, finalParams...),
-			status:         NewStatusWriter(os.Stderr),
-			options:        opts,
-			estimatedVRAM:  estimatedVRAM,
-			estimatedTotal: estimatedTotal,
-			sem:            semaphore.NewWeighted(int64(numParallel)),
-			totalLayers:    ggml.KV().BlockCount() + 1,
-			gpuCount:       gpuCount,
-			done:           make(chan error, 1),
+			port:        port,
+			cmd:         exec.Command(server, finalParams...),
+			status:      NewStatusWriter(os.Stderr),
+			options:     opts,
+			estimate:    estimate,
+			sem:         semaphore.NewWeighted(int64(numParallel)),
+			totalLayers: ggml.KV().BlockCount() + 1,
+			gpuCount:    gpuCount,
+			done:        make(chan error, 1),
 		}
 
 		s.cmd.Env = os.Environ()
@@ -1004,11 +1007,11 @@ func (s *llmServer) Close() error {
 }
 
 func (s *llmServer) EstimatedVRAM() uint64 {
-	return s.estimatedVRAM
+	return s.estimate.VRAMSize
 }
 
 func (s *llmServer) EstimatedTotal() uint64 {
-	return s.estimatedTotal
+	return s.estimate.TotalSize
 }
 
 func parseDurationMs(ms float64) time.Duration {

+ 1 - 0
server/sched_test.go

@@ -129,6 +129,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		"tokenizer.ggml.token_type":     []int32{0},
 	}, []llm.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 	})
 	require.NoError(t, err)