@@ -82,7 +82,7 @@ func LoadModel(model string, maxArraySize int) (*GGML, error) {
 
 // NewLlamaServer will run a server for the given GPUs
 // The gpu list must be a single family.
-func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options) (LlamaServer, error) {
+func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, projectors []string, opts api.Options, numParallel int) (LlamaServer, error) {
 	var err error
 	var cpuRunner string
 	var estimate MemoryEstimate
@@ -218,8 +218,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 
 	// Windows CUDA should not use mmap for best performance
 	// Linux with a model larger than free space, mmap leads to thrashing
+	// For CPU loads we want the memory to be allocated, not FS cache
 	if (runtime.GOOS == "windows" && gpus[0].Library == "cuda" && opts.UseMMap == api.TriStateUndefined) ||
 		(runtime.GOOS == "linux" && systemFreeMemory < estimate.TotalSize && opts.UseMMap == api.TriStateUndefined) ||
+		(gpus[0].Library == "cpu" && opts.UseMMap == api.TriStateUndefined) ||
 		opts.UseMMap == api.TriStateFalse {
 		params = append(params, "--no-mmap")
 	}
@@ -232,15 +234,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--numa")
 	}
 
-	numParallel := envconfig.NumParallel
-
-	// TODO (jmorganca): multimodal models don't support parallel yet
-	// see https://github.com/ollama/ollama/issues/4165
-	if len(projectors) > 0 {
-		numParallel = 1
-		slog.Warn("multimodal models don't support parallel requests yet")
-	}
-
 	params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))
 
 	if estimate.TensorSplit != "" {
@@ -567,6 +560,9 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 			if s.status != nil && s.status.LastErrMsg != "" {
 				msg = s.status.LastErrMsg
 			}
+			if strings.Contains(msg, "unknown model") {
+				return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade.")
+			}
 			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
 		default:
 		}
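
Taken together, these hunks move the parallel-request decision out of NewLlamaServer: numParallel is now an argument, so the caller chooses it before the runner is launched. Below is a minimal caller-side sketch in Go (written as if inside the llm package), assuming the caller still reads envconfig.NumParallel and still clamps multimodal models, i.e. those with projectors, to a single slot as the removed block did; the loadWithParallel helper is hypothetical and not part of this diff.

	// loadWithParallel is a hypothetical wrapper showing how a caller might
	// pick numParallel now that NewLlamaServer takes it as a parameter.
	func loadWithParallel(gpus gpu.GpuInfoList, model string, ggml *GGML,
		adapters, projectors []string, opts api.Options) (LlamaServer, error) {
		numParallel := envconfig.NumParallel
		if len(projectors) > 0 {
			// multimodal models still handle one request at a time
			numParallel = 1
		}
		return NewLlamaServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
	}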