Support Multiple LoRA Adapters (#7667)

Closes #7627
ItzCrazyKns 5 months ago
commit e3936d4fb3
2 changed files with 26 additions and 14 deletions
  1. llama/runner/runner.go (+23 -8)
  2. llm/server.go (+3 -6)

llama/runner/runner.go (+23 -8)

@@ -833,10 +833,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
+type multiLPath []string
+
+func (m *multiLPath) Set(value string) error {
+	*m = append(*m, value)
+	return nil
+}
+
+func (m *multiLPath) String() string {
+	return strings.Join(*m, ", ")
+}
+
 func (s *Server) loadModel(
 	params llama.ModelParams,
 	mpath string,
-	lpath string,
+	lpath multiLPath,
 	ppath string,
 	kvSize int,
 	flashAttention bool,
@@ -857,10 +868,12 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
-	if lpath != "" {
-		err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads)
-		if err != nil {
-			panic(err)
+	if lpath.String() != "" {
+		for _, path := range lpath {
+			err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
+			if err != nil {
+				panic(err)
+			}
 		}
 	}
 
@@ -890,7 +903,6 @@ func main() {
 	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
 	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
-	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
@@ -900,6 +912,9 @@ func main() {
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 	requirements := flag.Bool("requirements", false, "print json requirement information")
 
+	var lpaths multiLPath
+	flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
+
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
@@ -946,7 +961,7 @@ func main() {
 	params := llama.ModelParams{
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
-		UseMmap:      !*noMmap && *lpath == "",
+		UseMmap:      !*noMmap && lpaths.String() == "",
 		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
@@ -955,7 +970,7 @@ func main() {
 	}
 
 	server.ready.Add(1)
-	go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
+	go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
 
 	server.cond = sync.NewCond(&server.mu)
 

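The runner-side change leans on Go's flag.Value interface: any type with Set(string) error and String() string can be registered via flag.Var, and Set is called once per occurrence of the flag, so `-lora` may now be passed multiple times. Below is a minimal, self-contained sketch of that pattern; the program and type names are illustrative, not the actual runner code.

```go
package main

import (
	"flag"
	"fmt"
	"strings"
)

// multiPath collects every occurrence of a repeated flag.
// It satisfies flag.Value by providing Set and String.
type multiPath []string

func (m *multiPath) Set(value string) error {
	*m = append(*m, value) // each -lora occurrence appends one path
	return nil
}

func (m *multiPath) String() string {
	return strings.Join(*m, ", ")
}

func main() {
	var loras multiPath
	flag.Var(&loras, "lora", "Path to lora layer file (repeatable)")
	flag.Parse()

	// e.g. ./runner -lora a.gguf -lora b.gguf  ->  [a.gguf b.gguf]
	fmt.Println(loras)
}
```

Running the sketch with `-lora a.gguf -lora b.gguf` yields both paths in order, which is what loadModel then iterates over, calling ApplyLoraFromFile once per adapter.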
llm/server.go (+3 -6)

@@ -144,10 +144,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
 
-	if len(adapters) > 1 {
-		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
-	}
-
 	rDir, err := runners.Refresh(build.EmbedFS)
 	if err != nil {
 		return nil, err
@@ -201,8 +197,9 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	}
 
 	if len(adapters) > 0 {
-		// TODO: applying multiple adapters is not supported by the llama.cpp server yet
-		params = append(params, "--lora", adapters[0])
+		for _, adapter := range adapters {
+			params = append(params, "--lora", adapter)
+		}
 	}
 
 	if len(projectors) > 0 {
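On the llm/server.go side, the hard error for multiple adapters is dropped and every adapter path is simply forwarded to the runner as its own --lora argument. A rough sketch of the resulting argument construction, using a hypothetical helper rather than the actual NewLlamaServer code:

```go
package main

import "fmt"

// buildLoraArgs mirrors the loop above: every adapter path becomes its
// own "--lora <path>" pair on the runner's command line.
// (Hypothetical helper for illustration; not part of llm/server.go.)
func buildLoraArgs(adapters []string) []string {
	var params []string
	for _, adapter := range adapters {
		params = append(params, "--lora", adapter)
	}
	return params
}

func main() {
	// Two adapters now produce two --lora pairs instead of an error.
	fmt.Println(buildLoraArgs([]string{"a.gguf", "b.gguf"}))
	// Output: [--lora a.gguf --lora b.gguf]
}
```

Each repeated --lora flag is then collected on the runner side by the multiLPath type added in runner.go.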