Michael Yang, 10 months ago
parent
commit
55cd3ddcca
8 changed files with 82 additions and 83 deletions
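Summary of the change: the envconfig boolean settings (Debug, FlashAttention, NoHistory, NoPrune, SchedSpread, IntelGPU) move from variables populated once by LoadConfig to lazily evaluated func() bool accessors built by a new Bool helper, the clean helper is renamed to getenv, and all call sites switch from reading a variable to calling the accessor.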
  1. cmd/interactive.go (+1 -1)
  2. envconfig/config.go (+49 -74)
  3. envconfig/config_test.go (+26 -2)
  4. gpu/gpu.go (+1 -1)
  5. llm/server.go (+1 -1)
  6. server/images.go (+2 -2)
  7. server/routes.go (+1 -1)
  8. server/sched.go (+1 -1)

+ 1 - 1
cmd/interactive.go

@@ -157,7 +157,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 		return err
 	}

-	if envconfig.NoHistory {
+	if envconfig.NoHistory() {
 		scanner.HistoryDisable()
 	}


+ 49 - 74
envconfig/config.go

@@ -17,21 +17,6 @@ import (

 var ErrInvalidHostPort = errors.New("invalid port specified in OLLAMA_HOST")

-// Debug returns true if the OLLAMA_DEBUG environment variable is set to a truthy value.
-func Debug() bool {
-	if s := clean("OLLAMA_DEBUG"); s != "" {
-		b, err := strconv.ParseBool(s)
-		if err != nil {
-			// non-empty value is truthy
-			return true
-		}
-
-		return b
-	}
-
-	return false
-}
-
 // Host returns the scheme and host. Host can be configured via the OLLAMA_HOST environment variable.
 // Default is scheme "http" and host "127.0.0.1:11434"
 func Host() *url.URL {
@@ -77,7 +62,7 @@ func Host() *url.URL {

 // Origins returns a list of allowed origins. Origins can be configured via the OLLAMA_ORIGINS environment variable.
 func Origins() (origins []string) {
-	if s := clean("OLLAMA_ORIGINS"); s != "" {
+	if s := getenv("OLLAMA_ORIGINS"); s != "" {
 		origins = strings.Split(s, ",")
 	}

@@ -114,9 +99,37 @@ func Models() string {
 	return filepath.Join(home, ".ollama", "models")
 }

+func Bool(k string) func() bool {
+	return func() bool {
+		if s := getenv(k); s != "" {
+			b, err := strconv.ParseBool(s)
+			if err != nil {
+				return true
+			}
+
+			return b
+		}
+
+		return false
+	}
+}
+
+var (
+	// Debug enables additional debug information.
+	Debug = Bool("OLLAMA_DEBUG")
+	// FlashAttention enables the experimental flash attention feature.
+	FlashAttention = Bool("OLLAMA_FLASH_ATTENTION")
+	// NoHistory disables readline history.
+	NoHistory = Bool("OLLAMA_NOHISTORY")
+	// NoPrune disables pruning of model blobs on startup.
+	NoPrune = Bool("OLLAMA_NOPRUNE")
+	// SchedSpread allows scheduling models across all GPUs.
+	SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
+	// IntelGPU enables experimental Intel GPU detection.
+	IntelGPU = Bool("OLLAMA_INTEL_GPU")
+)
+
 var (
-	// Experimental flash attention
-	FlashAttention bool
 	// Set via OLLAMA_KEEP_ALIVE in the environment
 	KeepAlive time.Duration
 	// Set via OLLAMA_LLM_LIBRARY in the environment
@@ -125,22 +138,12 @@ var (
 	MaxRunners int
 	// Set via OLLAMA_MAX_QUEUE in the environment
 	MaxQueuedRequests int
-	// Set via OLLAMA_MODELS in the environment
-	ModelsDir string
-	// Set via OLLAMA_NOHISTORY in the environment
-	NoHistory bool
-	// Set via OLLAMA_NOPRUNE in the environment
-	NoPrune bool
 	// Set via OLLAMA_NUM_PARALLEL in the environment
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
-	// Set via OLLAMA_SCHED_SPREAD in the environment
-	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
-	// Set via OLLAMA_INTEL_GPU in the environment
-	IntelGpu bool

 	// Set via CUDA_VISIBLE_DEVICES in the environment
 	CudaVisibleDevices string
@@ -163,19 +166,19 @@ type EnvVar struct {
 func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
-		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention, "Enabled flash attention"},
+		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive, "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MAX_QUEUE":         {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
 		"OLLAMA_MODELS":            {"OLLAMA_MODELS", Models(), "The path to the models directory"},
-		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
-		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
+		"OLLAMA_NOHISTORY":         {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
+		"OLLAMA_NOPRUNE":           {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
-		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
+		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
 	}
 	if runtime.GOOS != "darwin" {
 	if runtime.GOOS != "darwin" {
@@ -184,7 +187,7 @@ func AsMap() map[string]EnvVar {
 		ret["ROCR_VISIBLE_DEVICES"] = EnvVar{"ROCR_VISIBLE_DEVICES", RocrVisibleDevices, "Set which AMD devices are visible"}
 		ret["GPU_DEVICE_ORDINAL"] = EnvVar{"GPU_DEVICE_ORDINAL", GpuDeviceOrdinal, "Set which AMD devices are visible"}
 		ret["HSA_OVERRIDE_GFX_VERSION"] = EnvVar{"HSA_OVERRIDE_GFX_VERSION", HsaOverrideGfxVersion, "Override the gfx used for all detected AMD GPUs"}
-		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGpu, "Enable experimental Intel GPU detection"}
+		ret["OLLAMA_INTEL_GPU"] = EnvVar{"OLLAMA_INTEL_GPU", IntelGPU(), "Enable experimental Intel GPU detection"}
 	}
 	return ret
 }
@@ -197,8 +200,8 @@ func Values() map[string]string {
 	return vals
 }

-// Clean quotes and spaces from the value
-func clean(key string) string {
+// getenv returns an environment variable stripped of leading and trailing quotes or spaces
+func getenv(key string) string {
 	return strings.Trim(os.Getenv(key), "\"' ")
 }

@@ -213,14 +216,7 @@ func init() {
 }

 func LoadConfig() {
-	if fa := clean("OLLAMA_FLASH_ATTENTION"); fa != "" {
-		d, err := strconv.ParseBool(fa)
-		if err == nil {
-			FlashAttention = d
-		}
-	}
-
-	RunnersDir = clean("OLLAMA_RUNNERS_DIR")
+	RunnersDir = getenv("OLLAMA_RUNNERS_DIR")
 	if runtime.GOOS == "windows" && RunnersDir == "" {
 		// On Windows we do not carry the payloads inside the main executable
 		appExe, err := os.Executable()
@@ -256,11 +252,11 @@ func LoadConfig() {
 		}
 	}

-	TmpDir = clean("OLLAMA_TMPDIR")
+	TmpDir = getenv("OLLAMA_TMPDIR")

-	LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
+	LLMLibrary = getenv("OLLAMA_LLM_LIBRARY")

-	if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
+	if onp := getenv("OLLAMA_NUM_PARALLEL"); onp != "" {
 		val, err := strconv.Atoi(onp)
 		if err != nil {
 			slog.Error("invalid setting, ignoring", "OLLAMA_NUM_PARALLEL", onp, "error", err)
@@ -269,24 +265,7 @@ func LoadConfig() {
 		}
 	}

-	if nohistory := clean("OLLAMA_NOHISTORY"); nohistory != "" {
-		NoHistory = true
-	}
-
-	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
-		s, err := strconv.ParseBool(spread)
-		if err == nil {
-			SchedSpread = s
-		} else {
-			SchedSpread = true
-		}
-	}
-
-	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
-		NoPrune = true
-	}
-
-	maxRunners := clean("OLLAMA_MAX_LOADED_MODELS")
+	maxRunners := getenv("OLLAMA_MAX_LOADED_MODELS")
 	if maxRunners != "" {
 		m, err := strconv.Atoi(maxRunners)
 		if err != nil {
@@ -305,20 +284,16 @@ func LoadConfig() {
 		}
 	}

-	ka := clean("OLLAMA_KEEP_ALIVE")
+	ka := getenv("OLLAMA_KEEP_ALIVE")
 	if ka != "" {
 		loadKeepAlive(ka)
 	}

-	if set, err := strconv.ParseBool(clean("OLLAMA_INTEL_GPU")); err == nil {
-		IntelGpu = set
-	}
-
-	CudaVisibleDevices = clean("CUDA_VISIBLE_DEVICES")
-	HipVisibleDevices = clean("HIP_VISIBLE_DEVICES")
-	RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
-	GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
-	HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")
+	CudaVisibleDevices = getenv("CUDA_VISIBLE_DEVICES")
+	HipVisibleDevices = getenv("HIP_VISIBLE_DEVICES")
+	RocrVisibleDevices = getenv("ROCR_VISIBLE_DEVICES")
+	GpuDeviceOrdinal = getenv("GPU_DEVICE_ORDINAL")
+	HsaOverrideGfxVersion = getenv("HSA_OVERRIDE_GFX_VERSION")
 }

 func loadKeepAlive(ka string) {

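To make the new pattern concrete, here is a minimal standalone sketch of the Bool helper introduced above (the package layout, the inlined trim, and the demo in main are illustrative; in the commit the helper lives in envconfig and reads through getenv):

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// Bool mirrors the helper added in this commit: it returns a closure that
// re-reads the environment on every call, so the value never needs a
// LoadConfig pass to refresh it.
func Bool(k string) func() bool {
	return func() bool {
		// getenv in the diff strips surrounding quotes and spaces.
		if s := strings.Trim(os.Getenv(k), "\"' "); s != "" {
			b, err := strconv.ParseBool(s)
			if err != nil {
				// Any non-empty value ParseBool rejects is treated as truthy.
				return true
			}
			return b
		}
		return false
	}
}

var Debug = Bool("OLLAMA_DEBUG")

func main() {
	os.Setenv("OLLAMA_DEBUG", "yes") // not valid for strconv.ParseBool, so truthy
	fmt.Println(Debug())             // true
	os.Setenv("OLLAMA_DEBUG", "0")
	fmt.Println(Debug()) // false: the closure sees the updated environment
}

Because each exported flag is now a function value rather than a variable, a test can t.Setenv and assert the accessor immediately, which is why the FlashAttention check in TestSmoke below no longer needs an intervening LoadConfig call.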
+ 26 - 2
envconfig/config_test.go

@@ -20,8 +20,8 @@ func TestSmoke(t *testing.T) {
 	require.True(t, Debug())

 	t.Setenv("OLLAMA_FLASH_ATTENTION", "1")
-	LoadConfig()
-	require.True(t, FlashAttention)
+	require.True(t, FlashAttention())
+
 	t.Setenv("OLLAMA_KEEP_ALIVE", "")
 	t.Setenv("OLLAMA_KEEP_ALIVE", "")
 	LoadConfig()
 	LoadConfig()
 	require.Equal(t, 5*time.Minute, KeepAlive)
 	require.Equal(t, 5*time.Minute, KeepAlive)
@@ -162,3 +162,27 @@ func TestOrigins(t *testing.T) {
 		})
 	}
 }
+
+func TestBool(t *testing.T) {
+	cases := map[string]struct {
+		value  string
+		expect bool
+	}{
+		"empty":     {"", false},
+		"true":      {"true", true},
+		"false":     {"false", false},
+		"1":         {"1", true},
+		"0":         {"0", false},
+		"random":    {"random", true},
+		"something": {"something", true},
+	}
+
+	for name, tt := range cases {
+		t.Run(name, func(t *testing.T) {
+			t.Setenv("OLLAMA_BOOL", tt.value)
+			if b := Bool("OLLAMA_BOOL"); b() != tt.expect {
+				t.Errorf("%s: expected %t, got %t", name, tt.expect, b())
+			}
+		})
+	}
+}

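The TestBool table above pins down the ParseBool semantics, including the fallback where any non-empty but unparseable value counts as true; with the standard Go tooling it can be run in isolation with something like:

	go test -v -run TestBool ./envconfig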
+ 1 - 1
gpu/gpu.go

@@ -302,7 +302,7 @@ func GetGPUInfo() GpuInfoList {
 		}

 		// Intel
-		if envconfig.IntelGpu {
+		if envconfig.IntelGPU() {
 			oHandles = initOneAPIHandles()
 			// On windows we bundle the oneapi library one level above the runner dir
 			depPath = ""

+ 1 - 1
llm/server.go

@@ -221,7 +221,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--memory-f32")
 	}

-	flashAttnEnabled := envconfig.FlashAttention
+	flashAttnEnabled := envconfig.FlashAttention()

 	for _, g := range gpus {
 		// only cuda (compute capability 7+) and metal support flash attention

+ 2 - 2
server/images.go

@@ -644,7 +644,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		return err
 	}

-	if !envconfig.NoPrune && old != nil {
+	if !envconfig.NoPrune() && old != nil {
 		if err := old.RemoveLayers(); err != nil {
 			return err
 		}
@@ -883,7 +883,7 @@ func PullModel(ctx context.Context, name string, regOpts *registryOptions, fn fu
 	// build deleteMap to prune unused layers
 	deleteMap := make(map[string]struct{})

-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		manifest, _, err = GetManifest(mp)
 		if err != nil && !errors.Is(err, os.ErrNotExist) {
 			return err

+ 1 - 1
server/routes.go

@@ -1121,7 +1121,7 @@ func Serve(ln net.Listener) error {
 		return err
 	}

-	if !envconfig.NoPrune {
+	if !envconfig.NoPrune() {
 		// clean up unused layers and manifests
 		if err := PruneLayers(); err != nil {
 			return err

+ 1 - 1
server/sched.go

@@ -695,7 +695,7 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList, numP
 		// First attempt to fit the model into a single GPU
 		for _, p := range numParallelToTry {
 			req.opts.NumCtx = req.origNumCtx * p
-			if !envconfig.SchedSpread {
+			if !envconfig.SchedSpread() {
 				for _, g := range sgl {
 					if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 						slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))