
Improve multi-gpu handling at the limit

Still not complete; the prediction needs further refinement to understand
each discrete GPU's available space so we can determine how many layers fit
on each one. Since we can't split a single layer across multiple GPUs, we
can't treat the combined free space as one logical block.
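
The core idea of this change, per the description above, is to stop pooling free VRAM and instead place whole layers onto individual GPUs that still have room. Below is a minimal, self-contained Go sketch of that placement idea; fitLayers and the sample sizes are illustrative only, not part of this commit, and the real logic in llm/memory.go additionally reserves graph, projector, output-layer and per-GPU minimum overheads and rotates across the remaining GPUs.

package main

import "fmt"

// fitLayers distributes whole layers across per-GPU free space.
// A layer can never be split across GPUs, so each layer must fit
// entirely within a single GPU's remaining memory. Simplified
// first-fit sketch; overheads are ignored here.
func fitLayers(freeMemory, layerSizes []uint64) (perGPU []int, offloaded int) {
	remaining := append([]uint64(nil), freeMemory...)
	perGPU = make([]int, len(freeMemory))
	for _, size := range layerSizes {
		placed := false
		for i := range remaining {
			if remaining[i] >= size {
				remaining[i] -= size
				perGPU[i]++
				offloaded++
				placed = true
				break
			}
		}
		if !placed {
			break // the rest of the layers overflow to the CPU
		}
	}
	return perGPU, offloaded
}

func main() {
	// Two GPUs with 5 GiB free each and three 3 GiB layers: a single
	// 10 GiB pool would suggest all three fit (9 GiB), but since no
	// GPU has 3 GiB left after taking one layer, only two are offloaded.
	free := []uint64{5 << 30, 5 << 30}
	layers := []uint64{3 << 30, 3 << 30, 3 << 30}
	perGPU, offloaded := fitLayers(free, layers)
	fmt.Println("layers per GPU:", perGPU, "offloaded:", offloaded) // [1 1] 2
}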
Daniel Hiltgen, 11 months ago
Parent
Commit
6fd04ca922
11 changed files with 387 additions and 87 deletions
  1. gpu/amd_linux.go (+1 -1)
  2. gpu/gpu.go (+0 -4)
  3. gpu/types.go (+3 -3)
  4. integration/concurrency_test.go (+19 -1)
  5. integration/context_test.go (+1 -1)
  6. integration/utils_test.go (+1 -1)
  7. llm/ggml.go (+1 -0)
  8. llm/memory.go (+213 -48)
  9. llm/memory_test.go (+116 -0)
  10. llm/server.go (+31 -28)
  11. server/sched_test.go (+1 -0)

+ 1 - 1
gpu/amd_linux.go

@@ -27,7 +27,7 @@ const (
 	GPUTotalMemoryFileGlob = "mem_banks/*/properties" // size_in_bytes line
 
 	// Direct Rendering Manager sysfs location
-	DRMDeviceDirGlob   = "/sys/class/drm/card[0-9]/device"
+	DRMDeviceDirGlob   = "/sys/class/drm/card*/device"
 	DRMTotalMemoryFile = "mem_info_vram_total"
 	DRMUsedMemoryFile  = "mem_info_vram_used"
 

+ 0 - 4
gpu/gpu.go

@@ -246,10 +246,6 @@ func GetGPUInfo() GpuInfoList {
 			return GpuInfoList{cpus[0].GpuInfo}
 		}
 
-		// TODO - implement
-
-		// TODO refine the discovery to only gather total memory
-
 		// On windows we bundle the nvidia library one level above the runner dir
 		depPath := ""
 		if runtime.GOOS == "windows" && envconfig.RunnersDir != "" {

+ 3 - 3
gpu/types.go

@@ -44,14 +44,14 @@ type CPUInfo struct {
 
 type CudaGPUInfo struct {
 	GpuInfo
-	index int // device index
+	index int // nolint: unused
 }
 type CudaGPUInfoList []CudaGPUInfo
 
 type RocmGPUInfo struct {
 	GpuInfo
-	usedFilepath string // linux
-	index        int    // device index on windows
+	usedFilepath string // nolint: unused
+	index        int    // nolint: unused
 }
 type RocmGPUInfoList []RocmGPUInfo
 

+ 19 - 1
integration/concurrency_test.go

@@ -38,7 +38,7 @@ func TestMultiModelConcurrency(t *testing.T) {
 		}
 		resp = [2][]string{
 			[]string{"sunlight"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 		}
 	)
 	var wg sync.WaitGroup
@@ -229,5 +229,23 @@ func TestMultiModelStress(t *testing.T) {
 			}
 		}(i)
 	}
+	go func() {
+		for {
+			time.Sleep(2 * time.Second)
+			select {
+			case <-ctx.Done():
+				return
+			default:
+				models, err := client.ListRunning(ctx)
+				if err != nil {
+					slog.Warn("failed to list running models", "error", err)
+					continue
+				}
+				for _, m := range models.Models {
+					slog.Info("loaded model snapshot", "model", m)
+				}
+			}
+		}
+	}()
 	wg.Wait()
 }

+ 1 - 1
integration/context_test.go

@@ -11,7 +11,7 @@ import (
 )
 
 func TestContextExhaustion(t *testing.T) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) // TODO maybe shorter?
+	ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute) // Longer needed for small footprint GPUs
 	defer cancel()
 	// Set up the test data
 	req := api.GenerateRequest{

+ 1 - 1
integration/utils_test.go

@@ -331,7 +331,7 @@ func GenerateRequests() ([]api.GenerateRequest, [][]string) {
 		[][]string{
 			[]string{"sunlight"},
 			[]string{"soil", "organic", "earth", "black", "tan"},
-			[]string{"england", "english", "massachusetts", "pilgrims"},
+			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
 			[]string{"fourth", "july", "declaration", "independence"},
 			[]string{"nitrogen", "oxygen", "carbon", "dioxide"},
 		}

+ 1 - 0
llm/ggml.go

@@ -307,6 +307,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 
 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
+			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
 			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)

+ 213 - 48
llm/memory.go

@@ -3,9 +3,10 @@ package llm
 import (
 	"fmt"
 	"log/slog"
+	"strconv"
+	"strings"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -16,7 +17,8 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 		var layerCount int
-		layerCount, estimatedVRAM, _ = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+		layerCount, estimatedVRAM = estimate.Layers, estimate.VRAMSize
 		if opts.NumGPU < 0 {
 			if layerCount > 0 && layerCount >= int(ggml.KV().BlockCount()+1) {
 				return true, estimatedVRAM
@@ -30,24 +32,68 @@ func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors
 	return false, estimatedVRAM
 }
 
+type MemoryEstimate struct {
+	// How many layers we predict we can load
+	Layers int
+
+	// The size of the graph which occupies the main GPU
+	Graph uint64
+
+	// How much VRAM will be allocated given the number of layers we predict
+	VRAMSize uint64
+
+	// The total size of the model if loaded into VRAM.  If all layers are loaded, VRAMSize == TotalSize
+	TotalSize uint64
+
+	// For multi-GPU scenarios, this provides the tensor split parameter
+	TensorSplit string
+
+	// For multi-GPU scenarios, this is the size in bytes per GPU
+	GPUSizes []uint64
+}
+
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) (int, uint64, uint64) {
-	var memoryAvailable uint64
-	for _, info := range gpus {
-		memoryAvailable += info.FreeMemory
-	}
-	if envconfig.MaxVRAM > 0 {
-		memoryAvailable = envconfig.MaxVRAM
-	}
+func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+	// Graph size for a partial offload, applies to all GPUs
+	var graphPartialOffload uint64
+
+	// Graph size when all layers are offloaded, applies to all GPUs
+	var graphFullOffload uint64
+
+	// Final graph offload once we know full or partial
+	var graphOffload uint64
+
+	// Projectors loaded into GPU0 only
+	var projectorSize uint64
+
+	// Conditional output size on GPU 0
+	var memoryLayerOutput uint64
+	var includeOutput bool
+
+	// One extra layer as a pad for each GPU
+	var layerBuffer uint64
 
-	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", format.HumanBytes2(memoryAvailable))
+	// The sizes of the main layers
+	var layerSizes []uint64
 
-	// TODO - this is probably wrong, first GPU vs secondaries will have different overheads
-	memoryMinimum := gpus[0].MinimumMemory
+	// The sum of all the layer sizes (just for logging)
+	var memoryWeights uint64
+
+	// True if all the layers are loaded
+	var fullyLoaded bool
+
+	// Overflow that didn't fit into the GPU
+	var overflow uint64
+
+	availableList := make([]string, len(gpus))
+	for i, gpu := range gpus {
+		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
+	}
+	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)
 
 	for _, projector := range projectors {
-		memoryMinimum += projectorMemoryRequirements(projector)
+		projectorSize += projectorMemoryRequirements(projector)
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -56,40 +102,28 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()
 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		memoryMinimum += blk0.size()
+		layerBuffer = blk0.size()
 	}
 
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
-
 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
 	}
 
-	graphFullOffload *= uint64(len(gpus))
-	graphPartialOffload *= uint64(len(gpus))
-
 	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
 		graphPartialOffload = graphFullOffload
 	}
 
-	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
-	memoryRequiredTotal := memoryMinimum + graphFullOffload
-
-	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
-	memoryRequiredPartial := memoryMinimum + graphPartialOffload
-
-	var memoryLayerOutput uint64
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
 	}
-
 	if layer, ok := layers["output"]; ok {
 		memoryLayerOutput += layer.size()
 	} else if layer, ok := layers["token_embd"]; ok {
@@ -97,38 +131,144 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	}
 
 	if gpus[0].Library == "metal" && opts.UseMMap {
-		// memory is preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
-		memoryRequiredPartial += memoryLayerOutput
+		includeOutput = true
+	} else if gpus[0].Library != "metal" || !opts.UseMMap {
+		includeOutput = true
 	}
 
+	gpuZeroOverhead := projectorSize
+	if includeOutput {
+		gpuZeroOverhead += memoryLayerOutput
+	}
+
+	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
+	layerCounts := make([]int, len(gpus))
+	gpuAllocations := make([]uint64, len(gpus))
+	type gs struct {
+		i int
+		g *gpu.GpuInfo
+	}
+	gpusWithSpace := []gs{}
+	for i := range gpus {
+		var gzo uint64
+		if len(gpusWithSpace) == 0 {
+			gzo = gpuZeroOverhead
+		}
+		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least one more layer
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerBuffer {
+			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
+			continue
+		}
+		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
+		gpuAllocations[i] += gpus[i].MinimumMemory + layerBuffer // We hold off on graph until we know partial vs. full
+	}
+
+	var gpuZeroID int
+	if len(gpusWithSpace) > 0 {
+		gpuZeroID = gpusWithSpace[0].i
+		gpuAllocations[gpuZeroID] += gpuZeroOverhead
+	}
+
+	layerSizes = make([]uint64, int(ggml.KV().BlockCount()))
 	for i := range int(ggml.KV().BlockCount()) {
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
 			memoryLayer := blk.size()
 
 			// KV is proportional to the number of layers
 			memoryLayer += kv / ggml.KV().BlockCount()
+			layerSizes[i] = memoryLayer
+			memoryWeights += memoryLayer
+		}
+	}
 
-			memoryRequiredTotal += memoryLayer
-			if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredPartial+memoryLayer) {
-				memoryRequiredPartial += memoryLayer
+	// For all the layers, find where they can fit on the GPU(s)
+	for i := range layerSizes {
+		if layerSizes[i] == 0 {
+			continue
+		}
+		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
+			// Stop allocating on GPU(s) once we hit the users target NumGPU
+			continue
+		}
+
+		// distribute the layers across the GPU(s) that have space
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[i%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+layerSizes[i] {
+				gpuAllocations[g.i] += layerSizes[i]
+				layerCounts[g.i]++
 				layerCount++
+				break
+			} else {
+				gpusWithSpace = append(gpusWithSpace[:i%j], gpusWithSpace[i%j+1:]...)
 			}
 		}
+
+	}
+	if layerCount >= int(ggml.KV().BlockCount()) {
+		fullyLoaded = true
+	} else {
+		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
+			overflow += layerSizes[i]
+		}
+	}
+	// Find where the output fits
+	if includeOutput && memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
+		for j := len(gpusWithSpace); j > 0; j-- {
+			g := gpusWithSpace[layerCount%j]
+			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+			if g.g.FreeMemory > used+memoryLayerOutput {
+				gpuAllocations[g.i] += memoryLayerOutput
+				layerCounts[g.i]++
+				layerCount++
+				break
+			}
+		}
+		if layerCount < int(ggml.KV().BlockCount())+1 {
+			fullyLoaded = false
+			overflow += memoryLayerOutput
+		}
 	}
 
-	if gpus[0].Library != "metal" || !opts.UseMMap {
-		// memory was not preallocated for output tensors
-		memoryRequiredTotal += memoryLayerOutput
+	// Add the applicable (full or partial) graph allocations
+	for i := range gpus {
+		if layerCounts[i] <= 0 {
+			continue
+		}
+		if fullyLoaded {
+			gpuAllocations[i] += graphFullOffload
+		} else {
+			gpuAllocations[i] += graphPartialOffload
+		}
+	}
+	if fullyLoaded {
+		graphOffload = graphFullOffload
+	} else {
+		graphOffload = graphPartialOffload
 	}
 
-	if (opts.NumGPU >= 0 && layerCount+1 <= opts.NumGPU) || (opts.NumGPU < 0 && memoryAvailable > memoryRequiredTotal) {
-		layerCount = int(ggml.KV().BlockCount()) + 1
-		memoryRequiredPartial = memoryRequiredTotal
+	// Summaries for the log
+	var memoryRequiredPartial, memoryRequiredTotal uint64
+	for i := range gpuAllocations {
+		memoryRequiredPartial += gpuAllocations[i]
+
 	}
+	memoryRequiredTotal = memoryRequiredPartial + overflow
 
-	memoryWeights := memoryRequiredTotal - memoryMinimum - graphFullOffload - kv
+	tensorSplit := ""
+	if len(gpus) > 1 {
+		splits := make([]string, len(gpus))
+		for i, count := range layerCounts {
+			splits[i] = strconv.Itoa(count)
+		}
+		tensorSplit = strings.Join(splits, ",")
+	}
+	allocationsList := []string{}
+	for _, a := range gpuAllocations {
+		allocationsList = append(allocationsList, format.HumanBytes2(a))
+	}
 
 	slog.Info(
 		"offload to gpu",
@@ -136,13 +276,17 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			"layers",
 			// requested number of layers to offload
 			"requested", opts.NumGPU,
+			// The number of layers the model has (including output)
+			"model", int(ggml.KV().BlockCount())+1,
 			// estimated number of layers that can be offloaded
-			"real", layerCount,
+			"offload", layerCount,
+			// multi-gpu split for tensors
+			"split", tensorSplit,
 		),
 		slog.Group(
 			"memory",
-			// memory available for offloading
-			"available", format.HumanBytes2(memoryAvailable),
+			// memory available by GPU for offloading
+			"available", availableList,
 			slog.Group(
 				"required",
 				// memory required for full offloading
@@ -151,6 +295,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 				"partial", format.HumanBytes2(memoryRequiredPartial),
 				// memory of KV cache
 				"kv", format.HumanBytes2(kv),
+				// Allocations across the GPUs
+				"allocations", allocationsList,
 			),
 			slog.Group(
 				"weights",
@@ -171,12 +317,31 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		),
 	)
 	if gpus[0].Library == "cpu" {
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers:    0,
+			Graph:     0,
+			VRAMSize:  0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes:  []uint64{},
+		}
 	}
-	if memoryRequiredPartial > memoryAvailable {
+	if layerCount == 0 {
 		slog.Debug("insufficient VRAM to load any model layers")
-		return 0, 0, memoryRequiredTotal
+		return MemoryEstimate{
+			Layers:    0,
+			Graph:     0,
+			VRAMSize:  0,
+			TotalSize: memoryRequiredTotal,
+			GPUSizes:  []uint64{},
+		}
 	}
 
-	return layerCount, memoryRequiredPartial, memoryRequiredTotal
+	return MemoryEstimate{
+		Layers:      layerCount,
+		Graph:       graphOffload,
+		VRAMSize:    memoryRequiredPartial,
+		TotalSize:   memoryRequiredTotal,
+		TensorSplit: tensorSplit,
+		GPUSizes:    gpuAllocations,
+	}
 }
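
For orientation, a rough sketch of how a caller inside the repo might consume the new MemoryEstimate value; the model path is a placeholder, and the authoritative wiring is the llm/server.go change further below, which sets opts.NumGPU from estimate.Layers and passes TensorSplit via --tensor-split.

package main

import (
	"fmt"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
)

func main() {
	// Hypothetical model path for illustration only.
	ggml, err := llm.LoadModel("/tmp/model.gguf")
	if err != nil {
		panic(err)
	}
	opts := api.DefaultOptions()

	// EstimateGPULayers expects GPUs of a single library, so group them first.
	for _, gpus := range gpu.GetGPUInfo().ByLibrary() {
		estimate := llm.EstimateGPULayers(gpus, ggml, nil, opts)
		fmt.Println("library:", gpus[0].Library,
			"layers:", estimate.Layers,
			"vram:", format.HumanBytes2(estimate.VRAMSize),
			"total:", format.HumanBytes2(estimate.TotalSize),
			"split:", estimate.TensorSplit)
	}
}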

+ 116 - 0
llm/memory_test.go

@@ -0,0 +1,116 @@
+package llm
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"os"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
+	"github.com/ollama/ollama/gpu"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestEstimateGPULayers(t *testing.T) {
+	envconfig.Debug = true
+	modelName := "dummy"
+	f, err := os.CreateTemp(t.TempDir(), modelName)
+	assert.Nil(t, err)
+	defer f.Close()
+	gguf := NewGGUFV3(binary.LittleEndian)
+	inputLayerCount := 5
+	tensors := []Tensor{
+		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+	}
+	assert.Equal(t, inputLayerCount+1, len(tensors))
+	err = gguf.Encode(f, KV{
+		"general.architecture":          "llama",
+		"general.name":                  "name",
+		"llama.context_length":          uint32(32),
+		"llama.embedding_length":        uint32(4096),
+		"llama.block_count":             uint32(inputLayerCount),
+		"llama.attention.head_count":    uint32(32),
+		"llama.attention.head_count_kv": uint32(32),
+		"tokenizer.ggml.tokens":         []string{" "},
+		"tokenizer.ggml.scores":         []float32{0},
+		"tokenizer.ggml.token_type":     []int32{0},
+	}, tensors)
+	require.NoError(t, err)
+
+	ggml, err := LoadModel(f.Name())
+	require.NoError(t, err)
+
+	// Simple CPU scenario
+	gpus := []gpu.GpuInfo{
+		{
+			Library: "cpu",
+		},
+	}
+	projectors := []string{}
+	opts := api.DefaultOptions()
+	estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
+	assert.Equal(t, 0, estimate.Layers)
+	assert.Equal(t, uint64(0), estimate.Graph)
+
+	// derived from the dummy ggml file above
+	graphPartialOffload := uint64(202377216)
+	graphFullOffload := uint64(171968512)
+	layerSize := uint64(33554436)
+	projectorSize := uint64(0)
+	memoryLayerOutput := uint64(4)
+
+	// Dual CUDA scenario with asymmetry
+	gpuMinimumMemory := uint64(2048)
+	gpus = []gpu.GpuInfo{
+		{
+			Library:       "cuda",
+			MinimumMemory: gpuMinimumMemory,
+		},
+		{
+			Library:       "cuda",
+			MinimumMemory: gpuMinimumMemory,
+		},
+	}
+	// Nested array: GPU0 layer space, GPU1 layer space, expected gpu0, expected gpu1
+	for i, s := range [][]uint64{
+		{1, 1, 1, 1},
+		{2, 1, 2, 1},
+		{2, 2, 2, 2},
+		{1, 2, 1, 2},
+		{3, 3, 3, 3},
+		{4, 4, 3, 3},
+		{6, 6, 3, 3},
+		{0, 3, 0, 3},
+	} {
+		gpus[0].FreeMemory = 0
+		gpus[1].FreeMemory = 0
+		gpus[0].FreeMemory += projectorSize + memoryLayerOutput
+		gpus[0].FreeMemory += gpuMinimumMemory + layerSize + s[0]*layerSize + 1
+		gpus[1].FreeMemory += gpuMinimumMemory + layerSize + s[1]*layerSize + 1
+		gpus[0].FreeMemory += max(graphFullOffload, graphPartialOffload)
+		gpus[1].FreeMemory += max(graphFullOffload, graphPartialOffload)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
+		assert.Equal(t, int(s[2]+s[3]), estimate.Layers, "scenario %d: %v", i, s)
+		assert.Equal(t, fmt.Sprintf("%d,%d", s[2], s[3]), estimate.TensorSplit, "scenario %d: %v", i, s)
+		var layerSums uint64
+		for _, b := range estimate.GPUSizes {
+			layerSums += b
+		}
+		if estimate.Layers < inputLayerCount+1 {
+			assert.Less(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+			assert.Equal(t, estimate.VRAMSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+		} else {
+			assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
+			assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
+		}
+	}
+
+}

+ 31 - 28
llm/server.go

@@ -49,13 +49,11 @@ type llmServer struct {
 	status  *StatusWriter
 	options api.Options
 
-	// TODO - this should be broken down by GPU
-	estimatedVRAM  uint64 // Estimated usage of VRAM by the loaded model
-	estimatedTotal uint64 // Total size of model
-	totalLayers    uint64
-	gpuCount       int
-	loadDuration   time.Duration // Record how long it took the model to load
-	loadProgress   float32
+	estimate     MemoryEstimate
+	totalLayers  uint64
+	gpuCount     int
+	loadDuration time.Duration // Record how long it took the model to load
+	loadProgress float32
 
 	sem *semaphore.Weighted
 }
@@ -80,8 +78,7 @@ func LoadModel(model string) (*GGML, error) {
 func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
-	var estimatedVRAM uint64
-	var estimatedTotal uint64
+	var estimate MemoryEstimate
 	var systemMemory uint64
 	gpuCount := len(gpus)
 	if (len(gpus) == 1 && gpus[0].Library == "cpu") || opts.NumGPU == 0 {
@@ -89,7 +86,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 		cpuRunner = serverForCpu()
 		gpuCount = 0
-		_, _, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 	} else {
 		if gpus[0].Library == "metal" {
 			memInfo, err := gpu.GetCPUMem()
@@ -100,20 +97,19 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 				slog.Debug("system memory", "total", format.HumanBytes2(systemMemory))
 			}
 		}
-		var layers int
-		layers, estimatedVRAM, estimatedTotal = EstimateGPULayers(gpus, ggml, projectors, opts)
+		estimate = EstimateGPULayers(gpus, ggml, projectors, opts)
 
 		switch {
-		case gpus[0].Library == "metal" && estimatedVRAM > systemMemory:
+		case gpus[0].Library == "metal" && estimate.VRAMSize > systemMemory:
 			// disable partial offloading when model is greater than total system memory as this
 			// can lead to locking up the system
 			opts.NumGPU = 0
-		case gpus[0].Library != "metal" && layers == 0:
+		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = serverForCpu()
 			gpuCount = 0
-		case opts.NumGPU < 0 && layers > 0 && gpus[0].Library != "cpu":
-			opts.NumGPU = layers
+		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
+			opts.NumGPU = estimate.Layers
 		}
 	}
 
@@ -232,6 +228,14 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
+	if estimate.TensorSplit != "" {
+		params = append(params, "--tensor-split", estimate.TensorSplit)
+	}
+
+	if estimate.TensorSplit != "" {
+		params = append(params, "--tensor-split", estimate.TensorSplit)
+	}
+
 	for i := range len(servers) {
 		dir := availableServers[servers[i]]
 		if dir == "" {
@@ -299,16 +303,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		s := &llmServer{
-			port:           port,
-			cmd:            exec.Command(server, finalParams...),
-			status:         NewStatusWriter(os.Stderr),
-			options:        opts,
-			estimatedVRAM:  estimatedVRAM,
-			estimatedTotal: estimatedTotal,
-			sem:            semaphore.NewWeighted(int64(numParallel)),
-			totalLayers:    ggml.KV().BlockCount() + 1,
-			gpuCount:       gpuCount,
-			done:           make(chan error, 1),
+			port:        port,
+			cmd:         exec.Command(server, finalParams...),
+			status:      NewStatusWriter(os.Stderr),
+			options:     opts,
+			estimate:    estimate,
+			sem:         semaphore.NewWeighted(int64(numParallel)),
+			totalLayers: ggml.KV().BlockCount() + 1,
+			gpuCount:    gpuCount,
+			done:        make(chan error, 1),
 		}
 
 		s.cmd.Env = os.Environ()
@@ -1004,11 +1007,11 @@ func (s *llmServer) Close() error {
 }
 
 func (s *llmServer) EstimatedVRAM() uint64 {
-	return s.estimatedVRAM
+	return s.estimate.VRAMSize
 }
 
 func (s *llmServer) EstimatedTotal() uint64 {
-	return s.estimatedTotal
+	return s.estimate.TotalSize
 }
 
 func parseDurationMs(ms float64) time.Duration {

+ 1 - 0
server/sched_test.go

@@ -129,6 +129,7 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		"tokenizer.ggml.token_type":     []int32{0},
 	}, []llm.Tensor{
 		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
 	})
 	require.NoError(t, err)