Rename gpu package discover (#7143)

Cleaning up go package naming
Daniel Hiltgen 6 months ago
parent
commit
05cd82ef94
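
For downstream code the change is purely mechanical: the import path moves from github.com/ollama/ollama/gpu to github.com/ollama/ollama/discover and the package qualifier changes with it, while the function signatures and types stay the same. Below is a minimal sketch of an updated caller, modeled on the GetGPUInfo call in server/routes.go; the main wrapper and the printed fields are illustrative only, not part of this commit.

```go
package main

import (
	"fmt"

	// Before this commit the import was "github.com/ollama/ollama/gpu".
	"github.com/ollama/ollama/discover"
)

func main() {
	// gpu.GetGPUInfo() becomes discover.GetGPUInfo(); only the package
	// name and import path change, not the API surface.
	gpus := discover.GetGPUInfo()
	for _, g := range gpus {
		fmt.Println(g.Library, g.ID, g.FreeMemory)
	}
}
```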

+ 1 - 1
gpu/amd_common.go → discover/amd_common.go

@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
 	"errors"

+ 1 - 1
gpu/amd_hip_windows.go → discover/amd_hip_windows.go

@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"errors"

+ 1 - 1
gpu/amd_linux.go → discover/amd_linux.go

@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"bufio"

+ 1 - 1
gpu/amd_windows.go → discover/amd_windows.go

@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"bytes"

+ 1 - 1
gpu/cpu_common.go → discover/cpu_common.go

@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"os"

+ 1 - 1
gpu/cuda_common.go → discover/cuda_common.go

@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
 	"log/slog"

+ 1 - 1
gpu/gpu.go → discover/gpu.go

@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 /*
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm

+ 1 - 1
gpu/gpu_darwin.go → discover/gpu_darwin.go

@@ -1,6 +1,6 @@
 //go:build darwin
 
-package gpu
+package discover
 
 /*
 #cgo CFLAGS: -x objective-c

+ 0 - 0
gpu/gpu_info.h → discover/gpu_info.h


+ 0 - 0
gpu/gpu_info_cudart.c → discover/gpu_info_cudart.c


+ 0 - 0
gpu/gpu_info_cudart.h → discover/gpu_info_cudart.h


+ 0 - 0
gpu/gpu_info_darwin.h → discover/gpu_info_darwin.h


+ 0 - 0
gpu/gpu_info_darwin.m → discover/gpu_info_darwin.m


+ 0 - 0
gpu/gpu_info_nvcuda.c → discover/gpu_info_nvcuda.c


+ 0 - 0
gpu/gpu_info_nvcuda.h → discover/gpu_info_nvcuda.h


+ 0 - 0
gpu/gpu_info_nvml.c → discover/gpu_info_nvml.c


+ 0 - 0
gpu/gpu_info_nvml.h → discover/gpu_info_nvml.h


+ 0 - 0
gpu/gpu_info_oneapi.c → discover/gpu_info_oneapi.c


+ 0 - 0
gpu/gpu_info_oneapi.h → discover/gpu_info_oneapi.h


+ 1 - 1
gpu/gpu_linux.go → discover/gpu_linux.go

@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"bufio"

+ 1 - 1
gpu/gpu_oneapi.go → discover/gpu_oneapi.go

@@ -1,6 +1,6 @@
 //go:build linux || windows
 
-package gpu
+package discover
 
 import (
 	"log/slog"

+ 1 - 1
gpu/gpu_test.go → discover/gpu_test.go

@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"runtime"

+ 1 - 1
gpu/gpu_windows.go → discover/gpu_windows.go

@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"fmt"

+ 1 - 1
gpu/gpu_windows_test.go → discover/gpu_windows_test.go

@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import "testing"
 

+ 1 - 1
gpu/types.go → discover/types.go

@@ -1,4 +1,4 @@
-package gpu
+package discover
 
 import (
 	"fmt"

+ 4 - 4
llm/memory.go

@@ -7,13 +7,13 @@ import (
 	"strings"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 )
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
@@ -67,7 +67,7 @@ type MemoryEstimate struct {
 
 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
 	// Graph size for a partial offload, applies to all GPUs
 	var graphPartialOffload uint64
 
@@ -157,7 +157,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	gpuAllocations := make([]uint64, len(gpus))
 	type gs struct {
 		i int
-		g *gpu.GpuInfo
+		g *discover.GpuInfo
 	}
 	gpusWithSpace := []gs{}
 	for i := range gpus {

+ 3 - 3
llm/memory_test.go

@@ -10,7 +10,7 @@ import (
 	"github.com/stretchr/testify/require"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/discover"
 )
 
 func TestEstimateGPULayers(t *testing.T) {
@@ -50,7 +50,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	}
 
 	// Simple CPU scenario
-	gpus := []gpu.GpuInfo{
+	gpus := []discover.GpuInfo{
 		{
 			Library: "cpu",
 		},
@@ -72,7 +72,7 @@ func TestEstimateGPULayers(t *testing.T) {
 
 	// Dual CUDA scenario with assymetry
 	gpuMinimumMemory := uint64(2048)
-	gpus = []gpu.GpuInfo{
+	gpus = []discover.GpuInfo{
 		{
 			Library:       "cuda",
 			MinimumMemory: gpuMinimumMemory,

+ 8 - 8
llm/server.go

@@ -26,9 +26,9 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/build"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/runners"
 )
@@ -61,8 +61,8 @@ type llmServer struct {
 	estimate    MemoryEstimate
 	totalLayers uint64
 	// gpuCount     int
-	gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
-	loadDuration time.Duration   // Record how long it took the model to load
+	gpus         discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+	loadDuration time.Duration        // Record how long it took the model to load
 	loadProgress float32
 
 	sem *semaphore.Weighted
@@ -90,7 +90,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
 	var estimate MemoryEstimate
@@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var systemFreeMemory uint64
 	var systemSwapFreeMemory uint64
 
-	systemInfo := gpu.GetSystemInfo()
+	systemInfo := discover.GetSystemInfo()
 	systemTotalMemory = systemInfo.System.TotalMemory
 	systemFreeMemory = systemInfo.System.FreeMemory
 	systemSwapFreeMemory = systemInfo.System.FreeSwap
@@ -106,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
-		gpus = gpu.GetCPUInfo()
+		gpus = discover.GetCPUInfo()
 	}
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		cpuRunner = runners.ServerForCpu()
@@ -122,7 +122,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = runners.ServerForCpu()
-			gpus = gpu.GetCPUInfo()
+			gpus = discover.GetCPUInfo()
 		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 			opts.NumGPU = estimate.Layers
 		}
@@ -281,7 +281,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}
 
 		if strings.HasPrefix(servers[i], "cpu") {
-			gpus = gpu.GetCPUInfo()
+			gpus = discover.GetCPUInfo()
 		}
 
 		// Find an availableServers  port, retry on each iteration in case the failure was a port conflict race

+ 7 - 7
runners/common.go

@@ -18,8 +18,8 @@ import (
 
 	"golang.org/x/sync/errgroup"
 
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/gpu"
 )
 
 const (
@@ -301,11 +301,11 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 // serversForGpu returns a list of compatible servers give the provided GPU
 // info, ordered by performance. assumes Init() has been called
 // TODO - switch to metadata based mapping
-func ServersForGpu(info gpu.GpuInfo) []string {
+func ServersForGpu(info discover.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := GetAvailableServers(runnersDir)
 	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone.String() {
+	if info.Variant != discover.CPUCapabilityNone.String() {
 		requested += "_" + info.Variant
 	}
 
@@ -341,12 +341,12 @@ func ServersForGpu(info gpu.GpuInfo) []string {
 	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
 		// Load up the best CPU variant if not primary requested
 		if info.Library != "cpu" {
-			variant := gpu.GetCPUCapability()
+			variant := discover.GetCPUCapability()
 			// If no variant, then we fall back to default
 			// If we have a variant, try that if we find an exact match
 			// Attempting to run the wrong CPU instructions will panic the
 			// process
-			if variant != gpu.CPUCapabilityNone {
+			if variant != discover.CPUCapabilityNone {
 				for cmp := range availableServers {
 					if cmp == "cpu_"+variant.String() {
 						servers = append(servers, cmp)
@@ -371,9 +371,9 @@ func ServerForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		return "metal"
 	}
-	variant := gpu.GetCPUCapability()
+	variant := discover.GetCPUCapability()
 	availableServers := GetAvailableServers(runnersDir)
-	if variant != gpu.CPUCapabilityNone {
+	if variant != discover.CPUCapabilityNone {
 		for cmp := range availableServers {
 			if cmp == "cpu_"+variant.String() {
 				return cmp

+ 2 - 2
server/routes.go

@@ -27,8 +27,8 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/build"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
@@ -1235,7 +1235,7 @@ func Serve(ln net.Listener) error {
 
 	// At startup we retrieve GPU information so we can get log messages before loading a model
 	// This will log warnings to the log in case we have problems with detected GPUs
-	gpus := gpu.GetGPUInfo()
+	gpus := discover.GetGPUInfo()
 	gpus.LogDetails()
 
 	err = srvr.Serve(ln)

+ 9 - 9
server/routes_generate_test.go

@@ -15,7 +15,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -41,8 +41,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }
 
-func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+	return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return mock, nil
 	}
 }
@@ -69,10 +69,10 @@ func TestGenerateChat(t *testing.T) {
 			unloadedCh:    make(chan any, 1),
 			loaded:        make(map[string]*runnerRef),
 			newServerFn:   newMockServer(&mock),
-			getGpuFn:      gpu.GetGPUInfo,
-			getCpuFn:      gpu.GetCPUInfo,
+			getGpuFn:      discover.GetGPUInfo,
+			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -367,10 +367,10 @@ func TestGenerate(t *testing.T) {
 			unloadedCh:    make(chan any, 1),
 			loaded:        make(map[string]*runnerRef),
 			newServerFn:   newMockServer(&mock),
-			getGpuFn:      gpu.GetGPUInfo,
-			getCpuFn:      gpu.GetCPUInfo,
+			getGpuFn:      discover.GetGPUInfo,
+			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{

+ 23 - 23
server/sched.go

@@ -15,9 +15,9 @@ import (
 	"time"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -41,10 +41,10 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex
 
-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-	getGpuFn     func() gpu.GpuInfoList
-	getCpuFn     func() gpu.GpuInfoList
+	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	getGpuFn     func() discover.GpuInfoList
+	getCpuFn     func() discover.GpuInfoList
 	reschedDelay time.Duration
 }
 
@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		unloadedCh:    make(chan interface{}, maxQueue),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
-		getGpuFn:      gpu.GetGPUInfo,
-		getCpuFn:      gpu.GetCPUInfo,
+		getGpuFn:      discover.GetGPUInfo,
+		getCpuFn:      discover.GetCPUInfo,
 		reschedDelay:  250 * time.Millisecond,
 	}
 	sched.loadFn = sched.load
@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				} else {
 					// Either no models are loaded or below envconfig.MaxRunners
 					// Get a refreshed GPU list
-					var gpus gpu.GpuInfoList
+					var gpus discover.GpuInfoList
 					if pending.opts.NumGPU == 0 {
 						gpus = s.getCpuFn()
 					} else {
@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	}()
 }
 
-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 	type predKey struct {
 		Library string
 		ID      string
@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-	ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+	ret := append(discover.GpuInfoList{}, allGpus...)
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
 	for _, runner := range s.loaded {
@@ -541,8 +541,8 @@ type runnerRef struct {
 	// unloading bool      // set to true when we are trying to unload the runner
 
 	llama          llm.LlamaServer
-	loading        bool            // True only during initial load, then false forever
-	gpus           gpu.GpuInfoList // Recorded at time of provisioning
+	loading        bool                 // True only during initial load, then false forever
+	gpus           discover.GpuInfoList // Recorded at time of provisioning
 	estimatedVRAM  uint64
 	estimatedTotal uint64
 
@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	start := time.Now()
 
 	// Establish a baseline before we unload
-	gpusBefore := gpu.GetGPUInfo()
+	gpusBefore := discover.GetGPUInfo()
 	var totalMemoryBefore, freeMemoryBefore uint64
 	for _, gpu := range gpusBefore {
 		totalMemoryBefore += gpu.TotalMemory
@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 			}
 
 			// Query GPUs, look for free to go back up
-			gpusNow := gpu.GetGPUInfo()
+			gpusNow := discover.GetGPUInfo()
 			var totalMemoryNow, freeMemoryNow uint64
 			for _, gpu := range gpusNow {
 				totalMemoryNow += gpu.TotalMemory
@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64
 
 	var numParallelToTry []int
@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 
 	for _, gl := range gpus.ByLibrary() {
 		var ok bool
-		sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 
 		// TODO - potentially sort by performance capability, existing models loaded, etc.
 		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
 
 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
-						return []gpu.GpuInfo{g}
+						return []discover.GpuInfo{g}
 					}
 				}
 			}
@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
 	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {

+ 24 - 24
server/sched_test.go

@@ -13,8 +13,8 @@ import (
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -47,10 +47,10 @@ func TestLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
-	gpus := gpu.GpuInfoList{}
+	gpus := discover.GpuInfoList{}
 	s.load(req, ggml, gpus, 0)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
@@ -61,7 +61,7 @@ func TestLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
@@ -102,7 +102,7 @@ type reqBundle struct {
 	ggml    *llm.GGML
 }
 
-func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	return scenario.srv, nil
 }
 
@@ -151,18 +151,18 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	return b
 }
 
-func getGpuFn() gpu.GpuInfoList {
-	g := gpu.GpuInfo{Library: "metal"}
+func getGpuFn() discover.GpuInfoList {
+	g := discover.GpuInfo{Library: "metal"}
 	g.TotalMemory = 24 * format.GigaByte
 	g.FreeMemory = 12 * format.GigaByte
-	return []gpu.GpuInfo{g}
+	return []discover.GpuInfo{g}
 }
 
-func getCpuFn() gpu.GpuInfoList {
-	g := gpu.GpuInfo{Library: "cpu"}
+func getCpuFn() discover.GpuInfoList {
+	g := discover.GpuInfo{Library: "cpu"}
 	g.TotalMemory = 32 * format.GigaByte
 	g.FreeMemory = 26 * format.GigaByte
-	return []gpu.GpuInfo{g}
+	return []discover.GpuInfo{g}
 }
 
 func TestRequestsSameModelSameRequest(t *testing.T) {
@@ -420,9 +420,9 @@ func TestExpireRunner(t *testing.T) {
 	}
 
 	var ggml *llm.GGML
-	gpus := gpu.GpuInfoList{}
+	gpus := discover.GpuInfoList{}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
@@ -460,11 +460,11 @@ func TestPrematureExpired(t *testing.T) {
 	// Same model, same request
 	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
 	s := InitScheduler(ctx)
-	s.getGpuFn = func() gpu.GpuInfoList {
-		g := gpu.GpuInfo{Library: "metal"}
+	s.getGpuFn = func() discover.GpuInfoList {
+		g := discover.GpuInfo{Library: "metal"}
 		g.TotalMemory = 24 * format.GigaByte
 		g.FreeMemory = 12 * format.GigaByte
-		return []gpu.GpuInfo{g}
+		return []discover.GpuInfo{g}
 	}
 	s.newServerFn = scenario1a.newServer
 	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
@@ -530,7 +530,7 @@ func TestUseLoadedRunner(t *testing.T) {
 func TestUpdateFreeSpace(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
-	gpus := gpu.GpuInfoList{
+	gpus := discover.GpuInfoList{
 		{
 			Library: "a",
 			ID:      "1",
@@ -563,7 +563,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
-	gpus := gpu.GpuInfoList{
+	gpus := discover.GpuInfoList{
 		{
 			Library: "cuda",
 			ID:      "0",
@@ -573,7 +573,7 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 			ID:      "1",
 		},
 	}
-	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+	r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
 
 	s := InitScheduler(ctx)
 	s.loadedMu.Lock()
@@ -584,12 +584,12 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 	require.Len(t, tmp, 1)
 	require.Equal(t, "1", tmp[0].ID)
 
-	r1.gpus = gpu.GpuInfoList{gpus[1]}
+	r1.gpus = discover.GpuInfoList{gpus[1]}
 	tmp = s.filterGPUsWithoutLoadingModels(gpus)
 	require.Len(t, tmp, 1)
 	require.Equal(t, "0", tmp[0].ID)
 
-	r1.gpus = gpu.GpuInfoList{}
+	r1.gpus = discover.GpuInfoList{}
 	tmp = s.filterGPUsWithoutLoadingModels(gpus)
 	require.Len(t, tmp, 2)
 }
@@ -715,9 +715,9 @@ func TestHomogeneousGPUs(t *testing.T) {
 	defer done()
 	s := InitScheduler(ctx)
 
-	s.getGpuFn = func() gpu.GpuInfoList {
+	s.getGpuFn = func() discover.GpuInfoList {
 		// Set memory values to require the model to be spread
-		gpus := []gpu.GpuInfo{
+		gpus := []discover.GpuInfo{
 			{Library: "cuda"},
 			{Library: "rocm"},
 		}
@@ -729,7 +729,7 @@ func TestHomogeneousGPUs(t *testing.T) {
 	}
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		require.Len(t, gpus, 1)
 		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
 	}