
Improve multi-gpu handling at the limit

Still not complete; our prediction needs some refinement to understand each
discrete GPU's available space so we can see how many layers fit in each one.
Since we can't split one layer across multiple GPUs, we can't treat free space
as one logical block.
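
To illustrate the constraint described in the message above, here is a minimal, self-contained sketch (not the code from this commit; the sizes and the fitLayers helper are invented for illustration) of why per-GPU free memory cannot be pooled into one logical block when a whole layer must land on a single device:

// Hypothetical sketch: whole layers can't be split across GPUs, so free
// space must be fitted per device rather than summed into one pool.
package main

import "fmt"

// fitLayers places fixed-size layers greedily onto GPUs, never splitting a
// layer across devices, and returns the per-GPU layer counts.
func fitLayers(freeMem []uint64, layerSize uint64, totalLayers int) []int {
	counts := make([]int, len(freeMem))
	remaining := append([]uint64(nil), freeMem...)
	for placed := 0; placed < totalLayers; placed++ {
		best := -1
		for i, r := range remaining {
			if r >= layerSize && (best < 0 || r > remaining[best]) {
				best = i // pick the GPU with the most room left
			}
		}
		if best < 0 {
			break // no single GPU can hold another whole layer
		}
		remaining[best] -= layerSize
		counts[best]++
	}
	return counts
}

func main() {
	free := []uint64{5 << 30, 3 << 30}  // two GPUs: 5 GiB and 3 GiB free
	layer := uint64(2 << 30)            // hypothetical 2 GiB per layer
	counts := fitLayers(free, layer, 8) // try to place 8 layers

	fmt.Println("per-GPU layers:", counts) // [2 1]: only 3 layers fit
	// Pooling the 8 GiB of total free space would wrongly predict 4 layers.
	fmt.Println("pooled estimate:", (free[0]+free[1])/layer)
}

The estimator changes in llm/memory.go below work in the same spirit: they track allocations, overheads, and layer counts per GPU and derive a tensor-split string from the per-device counts.
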
Daniel Hiltgen, 11 months ago
Parent commit 6fd04ca922
11 files changed with 387 additions and 87 deletions
  1. gpu/amd_linux.go (+1 -1)
  2. gpu/gpu.go (+0 -4)
  3. gpu/types.go (+3 -3)
  4. integration/concurrency_test.go (+19 -1)
  5. integration/context_test.go (+1 -1)
  6. integration/utils_test.go (+1 -1)
  7. llm/ggml.go (+1 -0)
  8. llm/memory.go (+213 -48)
  9. llm/memory_test.go (+116 -0)
  10. llm/server.go (+31 -28)
  11. server/sched_test.go (+1 -0)

+ 1 - 1
gpu/amd_linux.go

@@ -27,7 +27,7 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 
 	// Direct Rendering Manager sysfs location
-	DRMDeviceDirGlob   = "/sys/class/drm/card[0-9]/device"
+	DRMDeviceDirGlob   = "/sys/class/drm/card*/device"
 	DRMTotalMemoryFile = "mem_info_vram_total"
 	DRMUsedMemoryFile  = "mem_info_vram_used"
 

+ 0 - 4
gpu/gpu.go

@@ -246,10 +246,6 @@ func GetGPUInfo() GpuInfoList {
 			return GpuInfoList{cpus[0].GpuInfo}
 		}
 
-		// TODO - implement
-
-		// TODO refine the discovery to only gather total memory
-
 		// On windows we bundle the nvidia library one level above the runner dir
 		depPath := ""
 		if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {

+ 3 - 3
gpu/types.go

@@ -44,14 +44,14 @@ type CPUInfo struct {
 
 type CudaGPUInfo struct {
 	GpuInfo
-	index int // device index
+	index int // nolint: unused
 }
 type CudaGPUInfoList []CudaGPUInfo
 
 type RocmGPUInfo struct {
 	GpuInfo
-	usedFilepath string // linux
-	index        int    // device index on windows
+	usedFilepath string // nolint: unused
+	index        int    // nolint: unused
 }
 type RocmGPUInfoList []RocmGPUInfo
 

+ 19 - 1
integration/concurrency_test.go

@@ -38,7 +38,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 		}
 		resp = [2][]string{
 			[]string{"sunlight"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 		}
 	)
 	var wg sync.WaitGroup
@@ -229,5 +229,23 @@ func TestMultiModelStress(t *testing.T) {
 			}
 		}(i)
 	}
+	go func() {
+		for {
+			time.Sleep(2 * time.Second)
+			select {
+			case <-ctx.Done():
+				return
+			default:
+				models, err := client.ListRunning(ctx)
+				if err != nil {
+					slog.Warn("failed to list running models", "error", err)
+					continue
+				}
+				for _, m := range models.Models {
+					slog.Info("loaded model snapshot", "model", m)
+				}
+			}
+		}
+	}()
 	wg.Wait()
 }

+ 1 - 1
integration/context_test.go

@@ -11,7 +11,7 @@ import (
 )
 
 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
+	ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute) // Longer needed for small footprint GPUs
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{

+ 1 - 1
integration/utils_test.go

@@ -331,7 +331,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		[][]string{
 			[]string{"sunlight"},
 			[]string{"soil", "organic", "earth", "black", "tan"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 			[]string{"fourth", "july", "declaration", "independence"},
 			[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
 		}

+ 1 - 0
llm/ggml.go

@@ -307,6 +307,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 
 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
+			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
 			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)

+ 213 - 48
llm/memory.go

@@ -3,9 +3,10 @@ package llm
 import (
 	"fmt"
 	"log/slog"
+	"strconv"
+	"strings"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -30,24 +32,68 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }
 
+type MemoryEstimate struct {
+	// How many layers we predict we can load
+	Layers int
+
+	// The size of the graph which occupies the main GPU
+	Graph uint64
+
+	// How much VRAM will be allocated given the number of layers we predict
+	VRAMSize uint64
+
+	// The total size of the model if loaded into VRAM.  If all layers are loaded, VRAMSize == TotalSize
+	TotalSize uint64
+
+	// For multi-GPU scenarios, this provides the tensor split parameter
+	TensorSplit string
+
+	// For multi-GPU scenarios, this is the size in bytes per GPU
+	GPUSizes []uint64
+}
+
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
-	var memoryAvailable uint64
-	for _, info := range gpus {
-		memoryAvailable += info.FreeMemory
-	}
-	if envconfig.MaxVRAM > 0 {
-		memoryAvailable = envconfig.MaxVRAM
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+	// Graph size for a partial offload, applies to all GPUs
+	var graphPartialOffload uint64
+
+	// Graph size when all layers are offloaded, applies to all GPUs
+	var graphFullOffload uint64
+
+	// Final graph offload once we know full or partial
+	var graphOffload uint64
+
+	// Projectors loaded into GPU0 only
+	var projectorSize uint64
+
+	// Conditional output size on GPU 0
+	var memoryLayerOutput uint64
+	var includeOutput bool
+
+	// One extra layer as a pad for each GPU
+	var layerBuffer uint64
 
-	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
+	// The sizes of the main layers
+	var layerSizes []uint64
 
-	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
-	memoryMinimum := gpus[0].MinimumMemory
+	// The sum of all the layer sizes (just for logging)
+	var memoryWeights uint64
+
+	// True if all the layers are loaded
+	var fullyLoaded bool
+
+	// Overflow that didn't fit into the GPU
+	var overflow uint64
+
+	availableList := make([]string, len(gpus))
+	for i, gpu := range gpus {
+		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
+	}
+	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
 
 	for _, projector := range projectors {
-		memoryMinimum += projectorMemoryRequirements(projector)
+		projectorSize += projectorMemoryRequirements(projector)
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -56,40 +102,28 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		memoryMinimum += blk0.size()
+		layerBuffer = blk0.size()
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
-
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
 	}
 
-	graphFullOffload *= uint64(len(gpus))
-	graphPartialOffload *= uint64(len(gpus))
-
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
 	}
 
-	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
-
-	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
-
-	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
 	}
-
 	if layer, ok := layers["output"]; ok {
 		memoryLayerOutput += layer.size()
 	} else if layer, ok := layers["token_embd"]; ok {
@@ -97,38 +131,144 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	}
 
 	if gpus[0].Library == "metal" && opts.UseMMap {
-		// memory is preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
-		memoryRequiredPartial += memoryLayerOutput
+		includeOutput = true
+	} else if gpus[0].Library != "metal" || !opts.UseMMap {
+		includeOutput = true
 	}
 
+	gpuZeroOverhead := projectorSize
+	if includeOutput {
+		gpuZeroOverhead += memoryLayerOutput
+	}
+
+	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
+	layerCounts := make([]int, len(gpus))
+	gpuAllocations := make([]uint64, len(gpus))
+	type gs struct {
+		i int
+		g *gpu.GpuInfo
+	}
+	gpusWithSpace := []gs{}
+	for i := range gpus {
+		var gzo uint64
+		if len(gpusWithSpace) == 0 {
+			gzo = gpuZeroOverhead
+		}
+		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
+			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
+			continue
+		}
+		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
+		gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full
+	}
+
+	var gpuZeroID int
+	if len(gpusWithSpace) > 0 {
+		gpuZeroID = gpusWithSpace[0].i
+		gpuAllocations[gpuZeroID] += gpuZeroOverhead
+	}
+
+	layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
 	for i := range int(ggml.KV().BlockCount()) {
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
 			memoryLayer := blk.size()
 
 			// KV is proportional to the number of layers
 			memoryLayer += kv / ggml.KV().BlockCount()
+			layerSizes[i] = memoryLayer
+			memoryWeights += memoryLayer
+		}
+	}
 
-			memoryRequiredTotal += memoryLayer
-			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
-				memoryRequiredPartial += memoryLayer
+	// For all the layers, find where they can fit on the GPU(s)
+	for i := range layerSizes {
+		if layerSizes[i] == 0 {
+			continue
+		}
+		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
+			// Stop allocating on GPU(s) once we hit the users target NumGPU
+			continue
+		}
+
+		// distribute the layers across the GPU(s) that have space
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[i%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+layerSizes[i] {
+				gpuAllocations[g.i] += layerSizes[i]
+				layerCounts[g.i]++
 				layerCount++
+				break
+			} else {
+				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
+
+	}
+	if layerCount >= int(ggml.KV().BlockCount()) {
+		fullyLoaded = true
+	} else {
+		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+			overflow += layerSizes[i]
+		}
+	}
+	// Find where the output fits
+	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[layerCount%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+memoryLayerOutput {
+				gpuAllocations[g.i] += memoryLayerOutput
+				layerCounts[g.i]++
+				layerCount++
+				break
+			}
+		}
+		if layerCount < int(ggml.KV().BlockCount())+1 {
+			fullyLoaded = false
+			overflow += memoryLayerOutput
+		}
 	}
 
-	if gpus[0].Library != "metal" || !opts.UseMMap {
-		// memory was not preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
+	// Add the applicable (full or partial) graph allocations
+	for i := range gpus {
+		if layerCounts[i] <= 0 {
+			continue
+		}
+		if fullyLoaded {
+			gpuAllocations[i] += graphFullOffload
+		} else {
+			gpuAllocations[i] += graphPartialOffload
+		}
+	}
+	if fullyLoaded {
+		graphOffload = graphFullOffload
+	} else {
+		graphOffload = graphPartialOffload
 	}
 
-	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
-		layerCount = int(ggml.KV().BlockCount()) + 1
-		memoryRequiredPartial = memoryRequiredTotal
+	// Summaries for the log
+	var memoryRequiredPartial, memoryRequiredTotal uint64
+	for i := range gpuAllocations {
+		memoryRequiredPartial += gpuAllocations[i]
+
 	}
+	memoryRequiredTotal = memoryRequiredPartial + overflow
 
-	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
+	tensorSplit := ""
+	if len(gpus) > 1 {
+		splits := make([]string, len(gpus))
+		for i, count := range layerCounts {
+			splits[i] = strconv.Itoa(count)
+		}
+		tensorSplit = strings.Join(splits, ",")
+	}
+	allocationsList := []string{}
+	for _, a := range gpuAllocations {
+		allocationsList = append(allocationsList, format.HumanBytes2(a))
+	}
 
 	slog.Info(
 		"offload to gpu",
@@ -136,13 +276,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			"layers",
 			// requested number of layers to offload
 			"requested", opts.NumGPU,
+			// The number of layers the model has (including output)
+			"model", int(ggml.KV().BlockCount())+1,
 			// estimated number of layers that can be offloaded
-			"real", layerCount,
+			"offload", layerCount,
+			// multi-gpu split for tensors
+			"split", tensorSplit,
 		),
 		slog.Group(
 			"memory",
-			// memory available for offloading
-			"available", format.HumanBytes2(memoryAvailable),
+			// memory available by GPU for offloading
+			"available", availableList,
 			slog.Group(
 				"required",
 				// memory required for full offloading
@@ -151,6 +295,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				"partial", format.HumanBytes2(memoryRequiredPartial),
 				// memory of KV cache
 				"kv", format.HumanBytes2(kv),
+				// Allocations across the GPUs
+				"allocations", allocationsList,
 			),
 			slog.Group(
 				"weights",
@@ -171,12 +317,31 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		),
 	)
 	if gpus[0].Library == "cpu" {
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers:    0,
+			Graph:     0,
+			VRAMSize:  0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes:  []uint64{},
+		}
 	}
-	if memoryRequiredPartial > memoryAvailable {
+	if layerCount == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers:    0,
+			Graph:     0,
+			VRAMSize:  0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes:  []uint64{},
+		}
 	}
 
-	return layerCount, memoryRequiredPartial, memoryRequiredTotal
+	return MemoryEstimate{
+		Layers:      layerCount,
+		Graph:       graphOffload,
+		VRAMSize:    memoryRequiredPartial,
+		TotalSize:   memoryRequiredTotal,
+		TensorSplit: tensorSplit,
+		GPUSizes:    gpuAllocations,
+	}
 }

+ 116 - 0
llm/memory_test.go

@@ -0,0 +1,116 @@
+package llm
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/gpu"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestEstimateGPULayers(t *testing.T) {
+	envconfig.Debug = true
+	modelName := "dummy"
+	f, err := os.CreateTemp(t.TempDir(), modelName)
+	assert.Nil(t, err)
+	defer f.Close()
+	gguf := NewGGUFV3(binary.LittleEndian)
+	inputLayerCount := 5
+	tensors := []Tensor{
+		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+	}
+	assert.Equal(t, inputLayerCount+1, len(tensors))
+	err = gguf.Encode(f, KV{
+		"general.architecture":          "llama",
+		"general.name":                  "name",
+		"llama.context_length":          uint32(32),
+		"llama.embedding_length":        uint32(4096),
+		"llama.block_count":             uint32(inputLayerCount),
+		"llama.attention.head_count":    uint32(32),
+		"llama.attention.head_count_kv": uint32(32),
+		"tokenizer.ggml.tokens":         []string{" "},
+		"tokenizer.ggml.scores":         []float32{0},
+		"tokenizer.ggml.token_type":     []int32{0},
+	}, tensors)
+	require.NoError(t, err)
+
+	ggml, err := LoadModel(f.Name())
+	require.NoError(t, err)
+
+	// Simple CPU scenario
+	gpus := []gpu.GpuInfo{
+		{
+			Library: "cpu",
+		},
+	}
+	projectors := []string{}
+	opts := api.DefaultOptions()
+	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+	assert.Equal(t, 0, estimate.Layers)
+	assert.Equal(t, uint64(0), estimate.Graph)
+
+	// derived from the dummy ggml file above
+	graphPartialOffload := uint64(202377216)
+	graphFullOffload := uint64(171968512)
+	layerSize := uint64(33554436)
+	projectorSize := uint64(0)
+	memoryLayerOutput := uint64(4)
+
+	// Dual CUDA scenario with asymmetry
+	gpuMinimumMemory := uint64(2048)
+	gpus = []gpu.GpuInfo{
+		{
+			Library:       "cuda",
+			MinimumMemory: gpuMinimumMemory,
+		},
+		{
+			Library:       "cuda",
+			MinimumMemory: gpuMinimumMemory,
+		},
+	}
+	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
+	for i, s := range [][]uint64{
+		{1, 1, 1, 1},
+		{2, 1, 2, 1},
+		{2, 2, 2, 2},
+		{1, 2, 1, 2},
+		{3, 3, 3, 3},
+		{4, 4, 3, 3},
+		{6, 6, 3, 3},
+		{0, 3, 0, 3},
+	} {
+		gpus[0].FreeMemory = 0
+		gpus[1].FreeMemory = 0
+		gpus[0].FreeMemory += projectorSize + memoryLayerOutput
+		gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
+		gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
+		gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
+		gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+		assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
+		assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
+		var layerSums uint64
+		for _, b := range estimate.GPUSizes {
+			layerSums += b
+		}
+		if estimate.Layers < inputLayerCount+1 {
+			assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+			assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+		} else {
+			assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+			assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+		}
+	}
+
+}

+ 31 - 28
llm/server.go

@@ -49,13 +49,11 @@ type llmServer struct {
 	status  *StatusWriter
 	options api.Options
 
-	// TODO - this should be broken down by GPU
-	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
-	estimatedTotal uint64 // Total size of model
-	totalLayers    uint64
-	gpuCount       int
-	loadDuration   time.Duration // Record how long it took the model to load
-	loadProgress   float32
+	estimate     MemoryEstimate
+	totalLayers  uint64
+	gpuCount     int
+	loadDuration time.Duration // Record how long it took the model to load
+	loadProgress float32
 
 	sem *semaphore.Weighted
 }
@@ -80,8 +78,7 @@ func LoadModel(model string) (*GGML, error) {
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
-	var estimatedVRAM uint64
-	var estimatedTotal uint64
+	var estimate MemoryEstimate
 	var systemMemory uint64
 	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
@@ -89,7 +86,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 		cpuRunner = serverForCpu()
 		gpuCount = 0
-		_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,20 +97,19 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 				slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
 			}
 		}
-		var layers int
-		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 
 		switch {
-		case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
+		case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
-		case gpus[0].Library != "metal" && layers == 0:
+		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = serverForCpu()
 			gpuCount = 0
-		case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
-			opts.NumGPU = layers
+		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
+			opts.NumGPU = estimate.Layers
 		}
 	}
 
 
@@ -232,6 +228,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
+	if estimate.TensorSplit != "" {
+		params = append(params, "--tensor-split", estimate.TensorSplit)
+	}
+
+	if estimate.TensorSplit != "" {
+		params = append(params, "--tensor-split", estimate.TensorSplit)
+	}
+
 	for i := range len(servers) {
 		dir := availableServers[servers[i]]
 		if dir == "" {
@@ -299,16 +303,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		s := &llmServer{
-			port:           port,
-			cmd:            exec.Command(server, finalParams...),
-			status:         NewStatusWriter(os.Stderr),
-			options:        opts,
-			estimatedVRAM:  estimatedVRAM,
-			estimatedTotal: estimatedTotal,
-			sem:            semaphore.NewWeighted(int64(numParallel)),
-			totalLayers:    ggml.KV().BlockCount() + 1,
-			gpuCount:       gpuCount,
-			done:           make(chan error, 1),
+			port:        port,
+			cmd:         exec.Command(server, finalParams...),
+			status:      NewStatusWriter(os.Stderr),
+			options:     opts,
+			estimate:    estimate,
+			sem:         semaphore.NewWeighted(int64(numParallel)),
+			totalLayers: ggml.KV().BlockCount() + 1,
+			gpuCount:    gpuCount,
+			done:        make(chan error, 1),
 		}
 
 		s.cmd.Env = os.Environ()
@@ -1004,11 +1007,11 @@ func (s *llmServer) Close() error {
 }
 
 func (s *llmServer) EstimatedVRAM() uint64 {
-	return s.estimatedVRAM
+	return s.estimate.VRAMSize
 }
 
 func (s *llmServer) EstimatedTotal() uint64 {
-	return s.estimatedTotal
+	return s.estimate.TotalSize
 }
 
 func parseDurationMs(ms float64) time.Duration {

+ 1 - 0
server/sched_test.go

@@ -129,6 +129,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		"tokenizer.ggml.token_type":     []int32{0},
 	}, []llm.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 	})
 	require.NoError(t, err)