@@ -15,9 +15,9 @@ import (
 	"time"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 )
 
@@ -41,10 +41,10 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex
 
-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-	getGpuFn     func() gpu.GpuInfoList
-	getCpuFn     func() gpu.GpuInfoList
+	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	getGpuFn     func() discover.GpuInfoList
+	getCpuFn     func() discover.GpuInfoList
 	reschedDelay time.Duration
 }
 
@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		unloadedCh:    make(chan interface{}, maxQueue),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
-		getGpuFn:      gpu.GetGPUInfo,
-		getCpuFn:      gpu.GetCPUInfo,
+		getGpuFn:      discover.GetGPUInfo,
+		getCpuFn:      discover.GetCPUInfo,
 		reschedDelay:  250 * time.Millisecond,
 	}
 	sched.loadFn = sched.load
@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 			} else {
 				// Either no models are loaded or below envconfig.MaxRunners
 				// Get a refreshed GPU list
-				var gpus gpu.GpuInfoList
+				var gpus discover.GpuInfoList
 				if pending.opts.NumGPU == 0 {
 					gpus = s.getCpuFn()
 				} else {
@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }
 
-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	}()
 }
 
-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 	type predKey struct {
 		Library string
 		ID      string
@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-	ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+	ret := append(discover.GpuInfoList{}, allGpus...)
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
 	for _, runner := range s.loaded {
@@ -541,8 +541,8 @@ type runnerRef struct {
 	// unloading bool // set to true when we are trying to unload the runner
 
 	llama          llm.LlamaServer
-	loading        bool            // True only during initial load, then false forever
-	gpus           gpu.GpuInfoList // Recorded at time of provisioning
+	loading        bool                 // True only during initial load, then false forever
+	gpus           discover.GpuInfoList // Recorded at time of provisioning
 	estimatedVRAM  uint64
 	estimatedTotal uint64
 
@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	start := time.Now()
 
 	// Establish a baseline before we unload
-	gpusBefore := gpu.GetGPUInfo()
+	gpusBefore := discover.GetGPUInfo()
 	var totalMemoryBefore, freeMemoryBefore uint64
 	for _, gpu := range gpusBefore {
 		totalMemoryBefore += gpu.TotalMemory
@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 			}
 
 			// Query GPUs, look for free to go back up
-			gpusNow := gpu.GetGPUInfo()
+			gpusNow := discover.GetGPUInfo()
 			var totalMemoryNow, freeMemoryNow uint64
 			for _, gpu := range gpusNow {
 				totalMemoryNow += gpu.TotalMemory
@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64
 
 	var numParallelToTry []int
@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 	for _, gl := range gpus.ByLibrary() {
 		var ok bool
-		sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
 
 		// TODO - potentially sort by performance capability, existing models loaded, etc.
 		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
 
 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
-						return []gpu.GpuInfo{g}
+						return []discover.GpuInfo{g}
 					}
 				}
 			}
 		}
@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }
 
 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {
 
 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
 	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {