@@ -53,6 +53,7 @@ type llmServer struct {
 	estimatedTotal uint64 // Total size of model
 	totalLayers    uint64
 	gpuCount       int
+	loadDuration   time.Duration // Record how long it took the model to load
 
 	sem *semaphore.Weighted
 }
@@ -291,6 +292,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		sem:         semaphore.NewWeighted(int64(numParallel)),
 		totalLayers: ggml.KV().BlockCount() + 1,
 		gpuCount:    gpuCount,
+		done:        make(chan error, 1),
 	}
 
 	s.cmd.Env = os.Environ()
@@ -339,6 +341,11 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			continue
 		}
 
+		// reap subprocess when it exits
+		go func() {
+			s.done <- s.cmd.Wait()
+		}()
+
 		return s, nil
 	}
 
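The goroutine added above is the core of the change: exactly one caller owns cmd.Wait, and everyone else observes the exit through a channel. A standalone sketch of that pattern (illustrative only, not ollama code; it assumes a POSIX sleep binary as a stand-in child process):

// reap_sketch.go — minimal sketch of the reap-on-exit pattern.
package main

import (
	"log/slog"
	"os/exec"
)

func main() {
	cmd := exec.Command("sleep", "1") // stand-in for the llama runner
	if err := cmd.Start(); err != nil {
		slog.Error("start failed", "error", err)
		return
	}

	// Buffered with capacity 1 so the goroutine can always deliver the
	// exit status without blocking, even if no one is receiving yet.
	// Wait also reaps the child, so it never lingers as a zombie.
	done := make(chan error, 1)
	go func() {
		done <- cmd.Wait()
	}()

	// cmd.Wait must only be called once per process; any number of call
	// sites can instead receive (or select) on done.
	err := <-done
	slog.Info("subprocess exited", "error", err, "code", cmd.ProcessState.ExitCode())
}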
@@ -483,13 +490,11 @@ func (s *llmServer) Ping(ctx context.Context) error {
 
 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	start := time.Now()
-	// TODO we need to wire up a better way to detect hangs during model load and startup of the server
 	expiresAt := time.Now().Add(10 * time.Minute) // be generous with timeout, large models can take a while to load
-	ticker := time.NewTicker(50 * time.Millisecond)
-	defer ticker.Stop()
 
 	slog.Info("waiting for llama runner to start responding")
 	var lastStatus ServerStatus = -1
+
 	for {
 		select {
 		case <-ctx.Done():
@@ -501,41 +506,39 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 				msg = s.status.LastErrMsg
 			}
 			return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
-		case <-ticker.C:
-			if time.Now().After(expiresAt) {
-				// timeout
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-				return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
-			}
-			if s.cmd.ProcessState != nil {
-				msg := ""
-				if s.status != nil && s.status.LastErrMsg != "" {
-					msg = s.status.LastErrMsg
-				}
-				return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
-			}
-
-			c, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
-			defer cancel()
-			status, err := s.getServerStatus(c)
-			if err != nil && lastStatus != status {
-				slog.Debug("server not yet available", "error", err)
-				lastStatus = status
-				continue
+		default:
+		}
+		if time.Now().After(expiresAt) {
+			// timeout
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
 			}
-
-			switch status {
-			case ServerStatusLoadingModel:
-				// TODO - this state never seems to happen with the current server.cpp code (bug?)
-				// it doesn't respond to the health endpoint until after the model is loaded
-				slog.Debug("loading model")
-			case ServerStatusReady:
-				slog.Debug(fmt.Sprintf("llama runner started in %f seconds", time.Since(start).Seconds()))
-				return nil
+			return fmt.Errorf("timed out waiting for llama runner to start: %s", msg)
+		}
+		if s.cmd.ProcessState != nil {
+			msg := ""
+			if s.status != nil && s.status.LastErrMsg != "" {
+				msg = s.status.LastErrMsg
 			}
+			return fmt.Errorf("llama runner process no longer running: %d %s", s.cmd.ProcessState.ExitCode(), msg)
+		}
+		c, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
+		status, _ := s.getServerStatus(c)
+		cancel()
+		if lastStatus != status && status != ServerStatusReady {
+			// Only log on status changes
+			slog.Info("waiting for server to become available", "status", status.ToString())
+		}
+		switch status {
+		case ServerStatusReady:
+			s.loadDuration = time.Since(start)
+			slog.Info(fmt.Sprintf("llama runner started in %0.2f seconds", s.loadDuration.Seconds()))
+			return nil
+		default:
+			lastStatus = status
+			time.Sleep(time.Millisecond * 250)
+			continue
 		}
 	}
 }
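Stripped of logging and error strings, the replacement loop has this shape: an overall deadline, a short per-probe timeout, and a fixed sleep between attempts. A simplified sketch, where probe stands in for getServerStatus and the durations mirror the patch:

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// waitUntilReady polls probe until it succeeds, the overall deadline
// passes, or the caller's context is cancelled.
func waitUntilReady(ctx context.Context, probe func(context.Context) error) error {
	deadline := time.Now().Add(10 * time.Minute)
	for {
		if err := ctx.Err(); err != nil {
			return err // caller gave up
		}
		if time.Now().After(deadline) {
			return errors.New("timed out waiting for server to start")
		}
		// Cancel each probe context explicitly rather than with defer,
		// so contexts are released per iteration, not at function exit.
		c, cancel := context.WithTimeout(ctx, 200*time.Millisecond)
		err := probe(c)
		cancel()
		if err == nil {
			return nil
		}
		time.Sleep(250 * time.Millisecond)
	}
}

func main() {
	readyAt := time.Now().Add(time.Second) // pretend the model loads in 1s
	start := time.Now()
	err := waitUntilReady(context.Background(), func(context.Context) error {
		if time.Now().Before(readyAt) {
			return errors.New("still loading")
		}
		return nil
	})
	fmt.Printf("ready after %0.2f seconds, err=%v\n", time.Since(start).Seconds(), err)
}

The per-iteration cancel is also why the hunk above calls cancel right after the status probe instead of deferring it inside the loop, which would pile up contexts until the function returns.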
@@ -943,8 +946,11 @@ func (s *llmServer) Close() error {
 		if err := s.cmd.Process.Kill(); err != nil {
 			return err
 		}
-
-		_ = s.cmd.Wait()
+		// if ProcessState is already populated, Wait already completed, no need to wait again
+		if s.cmd.ProcessState == nil {
+			slog.Debug("waiting for llama server to exit")
+			<-s.done
+		}
 
 		slog.Debug("llama server stopped")
 	}
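On the shutdown side, Close kills the child and then drains the same channel the reaper goroutine writes to, rather than calling Wait a second time. A sketch of that kill-then-drain sequence (again with a placeholder child; the ProcessState fast path mirrors the patch):

package main

import (
	"log/slog"
	"os/exec"
	"time"
)

func main() {
	cmd := exec.Command("sleep", "60") // placeholder long-running child
	if err := cmd.Start(); err != nil {
		slog.Error("start failed", "error", err)
		return
	}
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	time.Sleep(100 * time.Millisecond) // ... server runs ...

	if err := cmd.Process.Kill(); err != nil {
		slog.Error("kill failed", "error", err)
		return
	}
	// If the reaper goroutine has already run, ProcessState is set and the
	// exit status sits buffered in done; otherwise block here until the
	// goroutine reaps the killed child.
	if cmd.ProcessState == nil {
		slog.Debug("waiting for subprocess to exit")
		<-done
	}
	slog.Info("subprocess stopped")
}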