@@ -44,11 +44,12 @@ type LlamaServer interface {
 
 // llmServer is an instance of the llama.cpp server
 type llmServer struct {
-	port    int
-	cmd     *exec.Cmd
-	done    chan error // Channel to signal when the process exits
-	status  *StatusWriter
-	options api.Options
+	port        int
+	cmd         *exec.Cmd
+	done        chan error // Channel to signal when the process exits
+	status      *StatusWriter
+	options     api.Options
+	numParallel int
 
 	estimate    MemoryEstimate
 	totalLayers uint64
@@ -343,6 +344,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		status:      NewStatusWriter(os.Stderr),
 		options:     opts,
 		estimate:    estimate,
+		numParallel: numParallel,
 		sem:         semaphore.NewWeighted(int64(numParallel)),
 		totalLayers: ggml.KV().BlockCount() + 1,
 		gpus:        gpus,
@@ -890,11 +892,14 @@ type EmbedResponse struct {
 }
 
 func (s *llmServer) Embed(ctx context.Context, input []string) (*EmbedResponse, error) {
-	if err := s.sem.Acquire(ctx, 1); err != nil {
+	// each input will use a slot, so we need to acquire the semaphore for
+	// the number of inputs up to numParallel
+	slots := int64(min(len(input), s.numParallel))
+	if err := s.sem.Acquire(ctx, slots); err != nil {
 		slog.Error("Failed to acquire semaphore", "error", err)
 		return nil, err
 	}
-	defer s.sem.Release(slots)
+	defer s.sem.Release(slots)
 
 	// Make sure the server is ready
 	status, err := s.getServerStatusRetry(ctx)
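
For context on the last hunk: the acquired weight is capped at numParallel because requesting more weight than the semaphore was created with would block forever on a large batch. Below is a minimal, self-contained sketch of that weighted-semaphore pattern using golang.org/x/sync/semaphore (the same package the patch uses via semaphore.NewWeighted). It is illustrative only and not part of the patch; the numParallel value, the inputs slice, and the local min helper are assumptions for the sake of the example.

package main

import (
	"context"
	"fmt"

	"golang.org/x/sync/semaphore"
)

// min mirrors the built-in min used in the patch, for readability on
// older Go versions; on Go 1.21+ this simply shadows the predeclared one.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}

func main() {
	const numParallel = 4 // illustrative number of server slots
	sem := semaphore.NewWeighted(int64(numParallel))

	inputs := []string{"a", "b", "c", "d", "e", "f"} // illustrative batch

	// Acquire one slot per input, capped at numParallel, so a batch larger
	// than the slot count never asks for more weight than the semaphore
	// holds (which would otherwise block indefinitely).
	slots := int64(min(len(inputs), numParallel))
	if err := sem.Acquire(context.Background(), slots); err != nil {
		fmt.Println("acquire failed:", err)
		return
	}
	defer sem.Release(slots)

	fmt.Printf("processing %d inputs using %d slots\n", len(inputs), slots)
}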