
Rename gpu package discover (#7143)

Cleaning up go package naming
Daniel Hiltgen, 6 months ago
commit 05cd82ef94
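
For downstream callers the change is mechanical: the import path moves from github.com/ollama/ollama/gpu to github.com/ollama/ollama/discover, and the package qualifier changes with it. Below is a minimal sketch of a caller after the rename; the standalone main wrapper is hypothetical, while the import path, GetGPUInfo, and LogDetails all appear in the diff that follows.

package main

import (
	// Previously "github.com/ollama/ollama/gpu"; only the path and qualifier change.
	"github.com/ollama/ollama/discover"
)

func main() {
	// Enumerate detected GPUs and log their details, as server/routes.go does at startup.
	gpus := discover.GetGPUInfo()
	gpus.LogDetails()
}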

+ 1 - 1
gpu/amd_common.go → discover/amd_common.go

@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 import (
 	"errors"

+ 1 - 1
gpu/amd_hip_windows.go → discover/amd_hip_windows.go

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 	"errors"

+ 1 - 1
gpu/amd_linux.go → discover/amd_linux.go

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 	"bufio"

+ 1 - 1
gpu/amd_windows.go → discover/amd_windows.go

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 	"bytes"

+ 1 - 1
gpu/cpu_common.go → discover/cpu_common.go

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 	"os"

+ 1 - 1
gpu/cuda_common.go → discover/cuda_common.go

@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 import (
 	"log/slog"

+ 1 - 1
gpu/gpu.go → discover/gpu.go

@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 /*
 #cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm

+ 1 - 1
gpu/gpu_darwin.go → discover/gpu_darwin.go

@@ -1,6 +1,6 @@
 //go:build darwin

-package gpu
+package discover

 /*
 #cgo CFLAGS: -x objective-c

+ 0 - 0
gpu/gpu_info.h → discover/gpu_info.h


+ 0 - 0
gpu/gpu_info_cudart.c → discover/gpu_info_cudart.c


+ 0 - 0
gpu/gpu_info_cudart.h → discover/gpu_info_cudart.h


+ 0 - 0
gpu/gpu_info_darwin.h → discover/gpu_info_darwin.h


+ 0 - 0
gpu/gpu_info_darwin.m → discover/gpu_info_darwin.m


+ 0 - 0
gpu/gpu_info_nvcuda.c → discover/gpu_info_nvcuda.c


+ 0 - 0
gpu/gpu_info_nvcuda.h → discover/gpu_info_nvcuda.h


+ 0 - 0
gpu/gpu_info_nvml.c → discover/gpu_info_nvml.c


+ 0 - 0
gpu/gpu_info_nvml.h → discover/gpu_info_nvml.h


+ 0 - 0
gpu/gpu_info_oneapi.c → discover/gpu_info_oneapi.c


+ 0 - 0
gpu/gpu_info_oneapi.h → discover/gpu_info_oneapi.h


+ 1 - 1
gpu/gpu_linux.go → discover/gpu_linux.go

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 	"bufio"

+ 1 - 1
gpu/gpu_oneapi.go → discover/gpu_oneapi.go

@@ -1,6 +1,6 @@
 //go:build linux || windows

-package gpu
+package discover

 import (
 	"log/slog"

+ 1 - 1
gpu/gpu_test.go → discover/gpu_test.go

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 	"runtime"

+ 1 - 1
gpu/gpu_windows.go → discover/gpu_windows.go

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 	"fmt"

+ 1 - 1
gpu/gpu_windows_test.go → discover/gpu_windows_test.go

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import "testing"


+ 1 - 1
gpu/types.go → discover/types.go

@@ -1,4 +1,4 @@
-package gpu
+package discover

 import (
 	"fmt"

+ 4 - 4
llm/memory.go

@@ -7,13 +7,13 @@ import (
 	"strings"
 	"strings"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 )
 )
 
 
 // This algorithm looks for a complete fit to determine if we need to unload other models
 // This algorithm looks for a complete fit to determine if we need to unload other models
-func PredictServerFit(allGpus gpu.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
+func PredictServerFit(allGpus discover.GpuInfoList, ggml *GGML, adapters, projectors []string, opts api.Options) (bool, uint64) {
 	// Split up the GPUs by type and try them
 	// Split up the GPUs by type and try them
 	var estimatedVRAM uint64
 	var estimatedVRAM uint64
 	for _, gpus := range allGpus.ByLibrary() {
 	for _, gpus := range allGpus.ByLibrary() {
@@ -67,7 +67,7 @@ type MemoryEstimate struct {

 // Given a model and one or more GPU targets, predict how many layers and bytes we can load, and the total size
 // The GPUs provided must all be the same Library
-func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
+func EstimateGPULayers(gpus []discover.GpuInfo, ggml *GGML, projectors []string, opts api.Options) MemoryEstimate {
 	// Graph size for a partial offload, applies to all GPUs
 	var graphPartialOffload uint64

@@ -157,7 +157,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	gpuAllocations := make([]uint64, len(gpus))
 	type gs struct {
 		i int
-		g *gpu.GpuInfo
+		g *discover.GpuInfo
 	}
 	gpusWithSpace := []gs{}
 	for i := range gpus {

+ 3 - 3
llm/memory_test.go

@@ -10,7 +10,7 @@ import (
 	"github.com/stretchr/testify/require"
 	"github.com/stretchr/testify/require"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/discover"
 )
 )
 
 
 func TestEstimateGPULayers(t *testing.T) {
 func TestEstimateGPULayers(t *testing.T) {
@@ -50,7 +50,7 @@ func TestEstimateGPULayers(t *testing.T) {
 	}

 	// Simple CPU scenario
-	gpus := []gpu.GpuInfo{
+	gpus := []discover.GpuInfo{
 		{
 			Library: "cpu",
 		},
@@ -72,7 +72,7 @@ func TestEstimateGPULayers(t *testing.T) {

 	// Dual CUDA scenario with assymetry
 	gpuMinimumMemory := uint64(2048)
-	gpus = []gpu.GpuInfo{
+	gpus = []discover.GpuInfo{
 		{
 			Library:       "cuda",
 			MinimumMemory: gpuMinimumMemory,

+ 8 - 8
llm/server.go

@@ -26,9 +26,9 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/build"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llama"
 	"github.com/ollama/ollama/runners"
 )
@@ -61,8 +61,8 @@ type llmServer struct {
 	estimate    MemoryEstimate
 	totalLayers uint64
 	// gpuCount     int
-	gpus         gpu.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
-	loadDuration time.Duration   // Record how long it took the model to load
+	gpus         discover.GpuInfoList // Recorded just before the model loaded, free space will be incorrect
+	loadDuration time.Duration        // Record how long it took the model to load
 	loadProgress float32

 	sem *semaphore.Weighted
@@ -90,7 +90,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {

 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
+func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
 	var estimate MemoryEstimate
@@ -98,7 +98,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var systemFreeMemory uint64
 	var systemSwapFreeMemory uint64

-	systemInfo := gpu.GetSystemInfo()
+	systemInfo := discover.GetSystemInfo()
 	systemTotalMemory = systemInfo.System.TotalMemory
 	systemFreeMemory = systemInfo.System.FreeMemory
 	systemSwapFreeMemory = systemInfo.System.FreeSwap
@@ -106,7 +106,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr

 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
-		gpus = gpu.GetCPUInfo()
+		gpus = discover.GetCPUInfo()
 	}
 	if len(gpus) == 1 && gpus[0].Library == "cpu" {
 		cpuRunner = runners.ServerForCpu()
@@ -122,7 +122,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		case gpus[0].Library != "metal" && estimate.Layers == 0:
 			// Don't bother loading into the GPU if no layers can fit
 			cpuRunner = runners.ServerForCpu()
-			gpus = gpu.GetCPUInfo()
+			gpus = discover.GetCPUInfo()
 		case opts.NumGPU < 0 && estimate.Layers > 0 && gpus[0].Library != "cpu":
 			opts.NumGPU = estimate.Layers
 		}
@@ -281,7 +281,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		}

 		if strings.HasPrefix(servers[i], "cpu") {
-			gpus = gpu.GetCPUInfo()
+			gpus = discover.GetCPUInfo()
 		}

 		// Find an availableServers  port, retry on each iteration in case the failure was a port conflict race

+ 7 - 7
runners/common.go

@@ -18,8 +18,8 @@ import (

 	"golang.org/x/sync/errgroup"

+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/gpu"
 )

 const (
@@ -301,11 +301,11 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 // serversForGpu returns a list of compatible servers give the provided GPU
 // info, ordered by performance. assumes Init() has been called
 // TODO - switch to metadata based mapping
-func ServersForGpu(info gpu.GpuInfo) []string {
+func ServersForGpu(info discover.GpuInfo) []string {
 	// glob workDir for files that start with ollama_
 	availableServers := GetAvailableServers(runnersDir)
 	requested := info.Library
-	if info.Variant != gpu.CPUCapabilityNone.String() {
+	if info.Variant != discover.CPUCapabilityNone.String() {
 		requested += "_" + info.Variant
 	}

@@ -341,12 +341,12 @@ func ServersForGpu(info gpu.GpuInfo) []string {
 	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
 		// Load up the best CPU variant if not primary requested
 		if info.Library != "cpu" {
-			variant := gpu.GetCPUCapability()
+			variant := discover.GetCPUCapability()
 			// If no variant, then we fall back to default
 			// If we have a variant, try that if we find an exact match
 			// Attempting to run the wrong CPU instructions will panic the
 			// process
-			if variant != gpu.CPUCapabilityNone {
+			if variant != discover.CPUCapabilityNone {
 				for cmp := range availableServers {
 					if cmp == "cpu_"+variant.String() {
 						servers = append(servers, cmp)
@@ -371,9 +371,9 @@ func ServerForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		return "metal"
 	}
-	variant := gpu.GetCPUCapability()
+	variant := discover.GetCPUCapability()
 	availableServers := GetAvailableServers(runnersDir)
-	if variant != gpu.CPUCapabilityNone {
+	if variant != discover.CPUCapabilityNone {
 		for cmp := range availableServers {
 			if cmp == "cpu_"+variant.String() {
 				return cmp

+ 2 - 2
server/routes.go

@@ -27,8 +27,8 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/build"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/openai"
 	"github.com/ollama/ollama/parser"
@@ -1235,7 +1235,7 @@ func Serve(ln net.Listener) error {

 	// At startup we retrieve GPU information so we can get log messages before loading a model
 	// This will log warnings to the log in case we have problems with detected GPUs
-	gpus := gpu.GetGPUInfo()
+	gpus := discover.GetGPUInfo()
 	gpus.LogDetails()

 	err = srvr.Serve(ln)

+ 9 - 9
server/routes_generate_test.go

@@ -15,7 +15,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/gpu"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/llm"
 )
 )
 
 
@@ -41,8 +41,8 @@ func (mockRunner) Tokenize(_ context.Context, s string) (tokens []int, err error
 	return
 }

-func newMockServer(mock *mockRunner) func(gpu.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
-	return func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func newMockServer(mock *mockRunner) func(discover.GpuInfoList, string, *llm.GGML, []string, []string, api.Options, int) (llm.LlamaServer, error) {
+	return func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, projectors, system []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return mock, nil
 	}
 }
@@ -69,10 +69,10 @@ func TestGenerateChat(t *testing.T) {
 			unloadedCh:    make(chan any, 1),
 			loaded:        make(map[string]*runnerRef),
 			newServerFn:   newMockServer(&mock),
-			getGpuFn:      gpu.GetGPUInfo,
-			getCpuFn:      gpu.GetCPUInfo,
+			getGpuFn:      discover.GetGPUInfo,
+			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
@@ -367,10 +367,10 @@ func TestGenerate(t *testing.T) {
 			unloadedCh:    make(chan any, 1),
 			loaded:        make(map[string]*runnerRef),
 			newServerFn:   newMockServer(&mock),
-			getGpuFn:      gpu.GetGPUInfo,
-			getCpuFn:      gpu.GetCPUInfo,
+			getGpuFn:      discover.GetGPUInfo,
+			getCpuFn:      discover.GetCPUInfo,
 			reschedDelay:  250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+			loadFn: func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{

+ 23 - 23
server/sched.go

@@ -15,9 +15,9 @@ import (
 	"time"
 	"time"
 
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 	"github.com/ollama/ollama/llm"
 )
 )
 
 
@@ -41,10 +41,10 @@ type Scheduler struct {
 	loaded   map[string]*runnerRef
 	loadedMu sync.Mutex

-	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int)
-	newServerFn  func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
-	getGpuFn     func() gpu.GpuInfoList
-	getCpuFn     func() gpu.GpuInfoList
+	loadFn       func(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int)
+	newServerFn  func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
+	getGpuFn     func() discover.GpuInfoList
+	getCpuFn     func() discover.GpuInfoList
 	reschedDelay time.Duration
 }

@@ -69,8 +69,8 @@ func InitScheduler(ctx context.Context) *Scheduler {
 		unloadedCh:    make(chan interface{}, maxQueue),
 		loaded:        make(map[string]*runnerRef),
 		newServerFn:   llm.NewLlamaServer,
-		getGpuFn:      gpu.GetGPUInfo,
-		getCpuFn:      gpu.GetCPUInfo,
+		getGpuFn:      discover.GetGPUInfo,
+		getCpuFn:      discover.GetCPUInfo,
 		reschedDelay:  250 * time.Millisecond,
 	}
 	sched.loadFn = sched.load
@@ -157,7 +157,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				} else {
 					// Either no models are loaded or below envconfig.MaxRunners
 					// Get a refreshed GPU list
-					var gpus gpu.GpuInfoList
+					var gpus discover.GpuInfoList
 					if pending.opts.NumGPU == 0 {
 						gpus = s.getCpuFn()
 					} else {
@@ -409,7 +409,7 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
 	}()
 }

-func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel int) {
+func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel int) {
 	if numParallel < 1 {
 		numParallel = 1
 	}
@@ -470,7 +470,7 @@ func (s *Scheduler) load(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList,
 	}()
 }

-func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
+func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
 	type predKey struct {
 		Library string
 		ID      string
@@ -513,8 +513,8 @@ func (s *Scheduler) updateFreeSpace(allGpus gpu.GpuInfoList) {
 // to avoid scheduling another model on the same GPU(s) that haven't stabilized.
 // This routine returns the set of GPUs that do not have an active loading model.
 // If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
-func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus gpu.GpuInfoList) gpu.GpuInfoList {
-	ret := append(gpu.GpuInfoList{}, allGpus...)
+func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
+	ret := append(discover.GpuInfoList{}, allGpus...)
 	s.loadedMu.Lock()
 	defer s.loadedMu.Unlock()
 	for _, runner := range s.loaded {
@@ -541,8 +541,8 @@ type runnerRef struct {
 	// unloading bool      // set to true when we are trying to unload the runner

 	llama          llm.LlamaServer
-	loading        bool            // True only during initial load, then false forever
-	gpus           gpu.GpuInfoList // Recorded at time of provisioning
+	loading        bool                 // True only during initial load, then false forever
+	gpus           discover.GpuInfoList // Recorded at time of provisioning
 	estimatedVRAM  uint64
 	estimatedTotal uint64

@@ -630,7 +630,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 	start := time.Now()

 	// Establish a baseline before we unload
-	gpusBefore := gpu.GetGPUInfo()
+	gpusBefore := discover.GetGPUInfo()
 	var totalMemoryBefore, freeMemoryBefore uint64
 	for _, gpu := range gpusBefore {
 		totalMemoryBefore += gpu.TotalMemory
@@ -648,7 +648,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan interface{} {
 			}

 			// Query GPUs, look for free to go back up
-			gpusNow := gpu.GetGPUInfo()
+			gpusNow := discover.GetGPUInfo()
 			var totalMemoryNow, freeMemoryNow uint64
 			for _, gpu := range gpusNow {
 				totalMemoryNow += gpu.TotalMemory
@@ -685,7 +685,7 @@ func (a ByDuration) Less(i, j int) bool {
 // If the model can not be fit fully within the available GPU(s) nil is returned
 // If numParallel is <= 0, this will attempt try to optimize parallism based on available VRAM, and adjust
 // opts.NumCtx accordingly
-func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	var estimatedVRAM uint64

 	var numParallelToTry []int
@@ -698,22 +698,22 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL

 	for _, gl := range gpus.ByLibrary() {
 		var ok bool
-		sgl := append(make(gpu.GpuInfoList, 0, len(gl)), gl...)
+		sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)

 		// TODO - potentially sort by performance capability, existing models loaded, etc.
 		// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
 		// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
-		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
+		sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))

 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
 			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
-					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
+					if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 						*numParallel = p
-						return []gpu.GpuInfo{g}
+						return []discover.GpuInfo{g}
 					}
 				}
 			}
@@ -737,7 +737,7 @@ func pickBestFullFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoL
 }

 // If multiple Libraries are detected, pick the Library which loads the most layers for the model
-func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numParallel *int) gpu.GpuInfoList {
+func pickBestPartialFitByLibrary(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
 	if *numParallel <= 0 {
 		*numParallel = 1
 		req.opts.NumCtx = req.origNumCtx
@@ -822,7 +822,7 @@ func (s *Scheduler) expireRunner(model *Model) {

 // If other runners are loaded, make sure the pending request will fit in system memory
 // If not, pick a runner to unload, else return nil and the request can be loaded
-func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) *runnerRef {
+func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, ggml *llm.GGML, gpus discover.GpuInfoList) *runnerRef {
 	slog.Debug("evaluating if CPU model load will fit in available system memory")
 	estimate := llm.EstimateGPULayers(gpus, ggml, req.model.ProjectorPaths, req.opts)
 	if estimate.TotalSize <= gpus[0].FreeMemory {

+ 24 - 24
server/sched_test.go

@@ -13,8 +13,8 @@ import (

 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/app/lifecycle"
+	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/format"
-	"github.com/ollama/ollama/gpu"
 	"github.com/ollama/ollama/llm"
 )

@@ -47,10 +47,10 @@ func TestLoad(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2 * time.Second},
 	}
 	// Fail to load model first
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return nil, errors.New("something failed to load model blah")
 	}
-	gpus := gpu.GpuInfoList{}
+	gpus := discover.GpuInfoList{}
 	s.load(req, ggml, gpus, 0)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
@@ -61,7 +61,7 @@ func TestLoad(t *testing.T) {
 	require.Contains(t, err.Error(), "this model may be incompatible")

 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
@@ -102,7 +102,7 @@ type reqBundle struct {
 	ggml    *llm.GGML
 }

-func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 	return scenario.srv, nil
 }

@@ -151,18 +151,18 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 	return b
 }

-func getGpuFn() gpu.GpuInfoList {
-	g := gpu.GpuInfo{Library: "metal"}
+func getGpuFn() discover.GpuInfoList {
+	g := discover.GpuInfo{Library: "metal"}
 	g.TotalMemory = 24 * format.GigaByte
 	g.FreeMemory = 12 * format.GigaByte
-	return []gpu.GpuInfo{g}
+	return []discover.GpuInfo{g}
 }

-func getCpuFn() gpu.GpuInfoList {
-	g := gpu.GpuInfo{Library: "cpu"}
+func getCpuFn() discover.GpuInfoList {
+	g := discover.GpuInfo{Library: "cpu"}
 	g.TotalMemory = 32 * format.GigaByte
 	g.FreeMemory = 26 * format.GigaByte
-	return []gpu.GpuInfo{g}
+	return []discover.GpuInfo{g}
 }

 func TestRequestsSameModelSameRequest(t *testing.T) {
@@ -420,9 +420,9 @@ func TestExpireRunner(t *testing.T) {
 	}

 	var ggml *llm.GGML
-	gpus := gpu.GpuInfoList{}
+	gpus := discover.GpuInfoList{}
 	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		return server, nil
 	}
 	s.load(req, ggml, gpus, 0)
@@ -460,11 +460,11 @@ func TestPrematureExpired(t *testing.T) {
 	// Same model, same request
 	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
 	s := InitScheduler(ctx)
-	s.getGpuFn = func() gpu.GpuInfoList {
-		g := gpu.GpuInfo{Library: "metal"}
+	s.getGpuFn = func() discover.GpuInfoList {
+		g := discover.GpuInfo{Library: "metal"}
 		g.TotalMemory = 24 * format.GigaByte
 		g.FreeMemory = 12 * format.GigaByte
-		return []gpu.GpuInfo{g}
+		return []discover.GpuInfo{g}
 	}
 	s.newServerFn = scenario1a.newServer
 	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
@@ -530,7 +530,7 @@ func TestUseLoadedRunner(t *testing.T) {
 func TestUpdateFreeSpace(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
-	gpus := gpu.GpuInfoList{
+	gpus := discover.GpuInfoList{
 		{
 			Library: "a",
 			ID:      "1",
@@ -563,7 +563,7 @@ func TestUpdateFreeSpace(t *testing.T) {
 func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
 	defer done()
-	gpus := gpu.GpuInfoList{
+	gpus := discover.GpuInfoList{
 		{
 			Library: "cuda",
 			ID:      "0",
@@ -573,7 +573,7 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 			ID:      "1",
 		},
 	}
-	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}
+	r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}

 	s := InitScheduler(ctx)
 	s.loadedMu.Lock()
@@ -584,12 +584,12 @@ func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
 	require.Len(t, tmp, 1)
 	require.Equal(t, "1", tmp[0].ID)

-	r1.gpus = gpu.GpuInfoList{gpus[1]}
+	r1.gpus = discover.GpuInfoList{gpus[1]}
 	tmp = s.filterGPUsWithoutLoadingModels(gpus)
 	require.Len(t, tmp, 1)
 	require.Equal(t, "0", tmp[0].ID)

-	r1.gpus = gpu.GpuInfoList{}
+	r1.gpus = discover.GpuInfoList{}
 	tmp = s.filterGPUsWithoutLoadingModels(gpus)
 	require.Len(t, tmp, 2)
 }
@@ -715,9 +715,9 @@ func TestHomogeneousGPUs(t *testing.T) {
 	defer done()
 	s := InitScheduler(ctx)

-	s.getGpuFn = func() gpu.GpuInfoList {
+	s.getGpuFn = func() discover.GpuInfoList {
 		// Set memory values to require the model to be spread
-		gpus := []gpu.GpuInfo{
+		gpus := []discover.GpuInfo{
 			{Library: "cuda"},
 			{Library: "rocm"},
 		}
@@ -729,7 +729,7 @@ func TestHomogeneousGPUs(t *testing.T) {
 	}
 	s.getCpuFn = getCpuFn
 	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	s.newServerFn = func(gpus discover.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
 		require.Len(t, gpus, 1)
 		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
 	}