5 月之前 · 17b386a891
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -20,6 +20,8 @@ import (
 
				 	"time"
			
 
				 	"unicode/utf8"
			
 
				 
			
 
				+	"golang.org/x/sync/semaphore"
			
 
				+
			
 
				 	"github.com/ollama/ollama/api"
			
 
				 	"github.com/ollama/ollama/llama"
			
 
				 )
			
@@ -203,38 +205,51 @@ func (s *Server) inputs(prompt string, images []ImageData) ([]input, error) {
 
				 }
			
 
				 
			
 
				 type Server struct {
			
 
				+	// is the server ready to process requests?
			
 
				+	// protects access to model and image
			
 
				+	ready sync.WaitGroup
			
 
				+
			
 
				+	// loaded model
			
 
				 	model *llama.Model
			
 
				-	lc    *llama.Context
			
 
				 
			
 
				-	// required for image embeddings
			
 
				+	// image model context for multi-modal models
			
 
				 	image *ImageContext
			
 
				 
			
 
				+	// status for external health reporting - loading, ready to serve, etc.
			
 
				+	status ServerStatus
			
 
				+
			
 
				+	// current progress on loading the model
			
 
				+	progress float32
			
 
				+
			
 
				+	// number of simultaneous requests to handle
			
 
				+	parallel int
			
 
				+
			
 
				+	// maximum number of elements in a batch (per sequence)
			
 
				 	// TODO (jmorganca): make this n_batch
			
 
				 	batchSize int
			
 
				 
			
 
				-	// parallel is the number of parallel requests to handle
			
 
				-	parallel int
			
 
				+	// protects access to everything below this line
			
 
				+	// this is context state needed for decoding
			
 
				+	mu sync.Mutex
			
 
				+
			
 
				+	// indicates that data is ready for processing
			
 
				+	cond *sync.Cond
			
 
				+
			
 
				+	// decoding state
			
 
				+	lc *llama.Context
			
 
				 
			
 
				-	// seqs is the list of parallel sequences being evaluated
			
 
				-	// TODO (jmorganca): this can probably be moved into run()
			
 
				+	// the list of simultaneous sequences being evaluated
			
 
				 	seqs []*Sequence
			
 
				 
			
 
				+	// seqs can have a maximum of parallel entries, which
			
 
				+	// is enforced by seqsSem
			
 
				+	seqsSem *semaphore.Weighted
			
 
				+
			
 
				 	// KV cache
			
 
				 	cache *InputCache
			
 
				 
			
 
				 	// next sequence for prompt processing to avoid starvation
			
 
				 	nextSeq int
			
 
				-
			
 
				-	// is the server ready to process requests?
			
 
				-	ready sync.WaitGroup
			
 
				-
			
 
				-	mu sync.Mutex
			
 
				-
			
 
				-	cond *sync.Cond
			
 
				-
			
 
				-	progress float32
			
 
				-
			
 
				-	status ServerStatus
			
 
				 }
			
 
				 
			
 
				 func (s *Server) allNil() bool {
			
@@ -616,8 +631,13 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
 
				 		return
			
 
				 	}
			
 
				 
			
 
				-	// TODO (jmorganca): add to sequence queue instead of
			
 
				-	// failing if a slot isn't available
			
 
				+	// Ensure that a place to put the sequence is available
			
 
				+	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
			
 
				+		slog.Error("Failed to acquire semaphore", "error", err)
			
 
				+		return
			
 
				+	}
			
 
				+	defer s.seqsSem.Release(1)
			
 
				+
			
 
				 	s.mu.Lock()
			
 
				 	for i, sq := range s.seqs {
			
 
				 		if sq == nil {
			
@@ -700,7 +720,13 @@ func (s *Server) embeddings(w http.ResponseWriter, r *http.Request) {
 
				 		return
			
 
				 	}
			
 
				 
			
 
				-	// TODO (jessegross): Wait for a free slot instead of failing and blocking forever
			
 
				+	// Ensure that a place to put the sequence is available
			
 
				+	if err := s.seqsSem.Acquire(r.Context(), 1); err != nil {
			
 
				+		slog.Error("Failed to acquire semaphore", "error", err)
			
 
				+		return
			
 
				+	}
			
 
				+	defer s.seqsSem.Release(1)
			
 
				+
			
 
				 	s.mu.Lock()
			
 
				 	for i, sq := range s.seqs {
			
 
				 		if sq == nil {
			
@@ -855,6 +881,7 @@ func main() {
 
				 		batchSize: *batchSize,
			
 
				 		parallel:  *parallel,
			
 
				 		seqs:      make([]*Sequence, *parallel),
			
 
				+		seqsSem:   semaphore.NewWeighted(int64(*parallel)),
			
 
				 		status:    ServerStatusLoadingModel,
			
 
				 	}