|
@@ -68,6 +68,18 @@ var loaded struct {
|
|
|
|
|
|
var defaultSessionDuration = 5 * time.Minute
|
|
var defaultSessionDuration = 5 * time.Minute
|
|
|
|
|
|
|
|
+func unload() {
|
|
|
|
+ if loaded.llama != nil {
|
|
|
|
+ loaded.llama.Close()
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ loaded.llama = nil
|
|
|
|
+ loaded.model = ""
|
|
|
|
+ loaded.adapters = nil
|
|
|
|
+ loaded.projectors = nil
|
|
|
|
+ loaded.Options = nil
|
|
|
|
+}
|
|
|
|
+
|
|
// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
|
|
// load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
|
|
func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
|
|
func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
|
|
ctx, cancel := context.WithTimeout(c, 10*time.Second)
|
|
ctx, cancel := context.WithTimeout(c, 10*time.Second)
|
|
@@ -83,12 +95,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
|
|
if needLoad {
|
|
if needLoad {
|
|
if loaded.llama != nil {
|
|
if loaded.llama != nil {
|
|
slog.Info("changing loaded model")
|
|
slog.Info("changing loaded model")
|
|
- loaded.llama.Close()
|
|
|
|
- loaded.llama = nil
|
|
|
|
- loaded.model = ""
|
|
|
|
- loaded.adapters = nil
|
|
|
|
- loaded.projectors = nil
|
|
|
|
- loaded.Options = nil
|
|
|
|
|
|
+ unload()
|
|
}
|
|
}
|
|
|
|
|
|
llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
|
|
llama, err := llm.NewLlamaServer(model.ModelPath, model.AdapterPaths, model.ProjectorPaths, opts)
|
|
@@ -108,22 +115,19 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
|
|
loaded.projectors = model.ProjectorPaths
|
|
loaded.projectors = model.ProjectorPaths
|
|
loaded.llama = llama
|
|
loaded.llama = llama
|
|
loaded.Options = &opts
|
|
loaded.Options = &opts
|
|
|
|
+
|
|
|
|
+ if err = llama.WaitUntilRunning(); err != nil {
|
|
|
|
+ slog.Error("error loading llama server", "error", err)
|
|
|
|
+ unload()
|
|
|
|
+ return err
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
if loaded.expireTimer == nil {
|
|
if loaded.expireTimer == nil {
|
|
loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
|
|
loaded.expireTimer = time.AfterFunc(sessionDuration, func() {
|
|
loaded.mu.Lock()
|
|
loaded.mu.Lock()
|
|
defer loaded.mu.Unlock()
|
|
defer loaded.mu.Unlock()
|
|
-
|
|
|
|
- if loaded.llama != nil {
|
|
|
|
- loaded.llama.Close()
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- loaded.llama = nil
|
|
|
|
- loaded.model = ""
|
|
|
|
- loaded.adapters = nil
|
|
|
|
- loaded.projectors = nil
|
|
|
|
- loaded.Options = nil
|
|
|
|
|
|
+ unload()
|
|
})
|
|
})
|
|
}
|
|
}
|
|
|
|
|
|
@@ -1146,9 +1150,7 @@ func Serve(ln net.Listener) error {
|
|
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
|
|
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
|
|
go func() {
|
|
go func() {
|
|
<-signals
|
|
<-signals
|
|
- if loaded.llama != nil {
|
|
|
|
- loaded.llama.Close()
|
|
|
|
- }
|
|
|
|
|
|
+ unload()
|
|
gpu.Cleanup()
|
|
gpu.Cleanup()
|
|
os.Exit(0)
|
|
os.Exit(0)
|
|
}()
|
|
}()
|