浏览代码

disable gpu for certain model architectures and fix divide-by-zero on memory estimation

Jeffrey Morgan 1 年之前
父节点
当前提交
f9cd55c70b
共有 1 个文件被更改,包括 12 次插入、4 次删除
  1. 12 4
      llm/llm.go

+ 12 - 4
llm/llm.go

@@ -6,6 +6,7 @@ import (
 	"log/slog"
 	"log/slog"
 	"os"
 	"os"
 	"runtime"
 	"runtime"
+	"slices"
 
 
 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/api"
 	"github.com/jmorganca/ollama/gpu"
 	"github.com/jmorganca/ollama/gpu"
@@ -19,6 +20,10 @@ type LLM interface {
 	Close()
 	Close()
 }
 }
 
 
+var cpuOnlyFamilies = []string{
+	"mamba",
+}
+
 func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 		return nil, err
@@ -48,13 +53,18 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	size := ggml.Size
 	size := ggml.Size
 
 
 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(max(ggml.NumHead(), 1))
 
 
 	// this amount is the overhead + tensors in memory
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
 	// estimating it's 1/6 * kv_cache_size * num_gqa
 	graph := int64(ggml.NumGQA()) * kv / 6
 	graph := int64(ggml.NumGQA()) * kv / 6
 
 
+	// certain model architectures don't support gpu inference yet
+	if slices.Contains(cpuOnlyFamilies, ggml.ModelFamily()) {
+		opts.NumGPU = 0
+	}
+
 	info := gpu.GetGPUInfo()
 	info := gpu.GetGPUInfo()
 	switch runtime.GOOS {
 	switch runtime.GOOS {
 	case "darwin":
 	case "darwin":
@@ -63,9 +73,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 		}
 		}
 
 
 		if size+kv+graph > vram {
 		if size+kv+graph > vram {
-			slog.Info("not enough vram available, falling back to CPU only")
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
+			slog.Info("not enough vram available, setting num_gpu=0")
 			opts.NumGPU = 0
 			opts.NumGPU = 0
 			break
 			break
 		}
 		}