@@ -833,10 +833,21 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
+type multiLPath []string
+
+func (m *multiLPath) Set(value string) error {
+	*m = append(*m, value)
+	return nil
+}
+
+func (m *multiLPath) String() string {
+	return strings.Join(*m, ", ")
+}
+
 func (s *Server) loadModel(
 	params llama.ModelParams,
 	mpath string,
-	lpath string,
+	lpath multiLPath,
 	ppath string,
 	kvSize int,
 	flashAttention bool,
@@ -857,10 +868,12 @@ func (s *Server) loadModel(
 		panic(err)
 	}
 
-	if lpath != "" {
-		err := s.model.ApplyLoraFromFile(s.lc, lpath, 1.0, threads)
-		if err != nil {
-			panic(err)
+	if lpath.String() != "" {
+		for _, path := range lpath {
+			err := s.model.ApplyLoraFromFile(s.lc, path, 1.0, threads)
+			if err != nil {
+				panic(err)
+			}
 		}
 	}
 
@@ -890,7 +903,6 @@ func main() {
 	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
 	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
 	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
-	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
 	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
@@ -900,6 +912,9 @@ func main() {
 	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 	requirements := flag.Bool("requirements", false, "print json requirement information")
 
+	var lpaths multiLPath
+	flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
+
 	flag.Parse()
 	if *requirements {
 		printRequirements(os.Stdout)
@@ -946,7 +961,7 @@ func main() {
 	params := llama.ModelParams{
 		NumGpuLayers: *nGpuLayers,
 		MainGpu:      *mainGpu,
-		UseMmap:      !*noMmap && *lpath == "",
+		UseMmap:      !*noMmap && lpaths.String() == "",
 		UseMlock:     *mlock,
 		TensorSplit:  tensorSplitFloats,
 		Progress: func(progress float32) {
@@ -955,7 +970,7 @@ func main() {
 	}
 
 	server.ready.Add(1)
-	go server.loadModel(params, *mpath, *lpath, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
+	go server.loadModel(params, *mpath, lpaths, *ppath, *kvSize, *flashAttention, *threads, *multiUserCache)
 
 	server.cond = sync.NewCond(&server.mu)
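
Note: `flag.Var` accepts any value implementing the standard `flag.Value` interface (`String() string` and `Set(string) error`) and calls `Set` once per occurrence of the flag on the command line, which is what lets `-lora` be repeated to stack adapters; the adapters are then applied in order through `ApplyLoraFromFile`, and, as in the old code, mmap stays disabled whenever any adapter path is given. A minimal, self-contained sketch of the same pattern, separate from this patch (names here are illustrative):

package main

import (
	"flag"
	"fmt"
	"strings"
)

// multiLPath collects every occurrence of a repeatable string flag.
type multiLPath []string

// Set is invoked by the flag package once per occurrence on the command line.
func (m *multiLPath) Set(value string) error {
	*m = append(*m, value)
	return nil
}

// String renders the accumulated values; flag uses it when printing usage.
func (m *multiLPath) String() string {
	return strings.Join(*m, ", ")
}

func main() {
	var lpaths multiLPath
	flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
	flag.Parse()
	fmt.Println("loras:", lpaths)
}

Running this as `go run . -lora a.gguf -lora b.gguf` prints `loras: [a.gguf b.gguf]`; with the previous `flag.String("lora", ...)` definition, a repeated flag would have kept only the last value.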