@@ -93,6 +93,9 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
+	// allocate a large enough kv cache for all parallel requests
+	opts.NumCtx = opts.NumCtx * numParallel
+
 	req := &LlmRequest{
 		ctx:             c,
 		model:           model,
@@ -101,8 +104,7 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
 		successCh:       make(chan *runnerRef),
 		errCh:           make(chan error, 1),
 	}
-	// context split across parallel threads
-	opts.NumCtx = opts.NumCtx * numParallel
+
 	select {
 	case s.pendingReqCh <- req:
 	default:
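
Why the reorder matters: if LlmRequest snapshots opts by value when the struct literal is built (the opts field is not visible in this hunk, so this is an assumption), then multiplying NumCtx after constructing req would only change the local variable, not the copy the scheduler later uses to size the KV cache. A minimal sketch of that value-copy behaviour, using hypothetical options/llmRequest stand-ins rather than the real types:

package main

import "fmt"

// options is a hypothetical stand-in for api.Options, reduced to the
// one field the change touches.
type options struct {
	NumCtx int
}

// llmRequest is a hypothetical stand-in for the scheduler's LlmRequest;
// it copies options by value, which is the assumption behind the reorder.
type llmRequest struct {
	opts options
}

func main() {
	const numParallel = 4

	// New ordering (this patch): scale before the request copies opts.
	a := options{NumCtx: 2048}
	a.NumCtx = a.NumCtx * numParallel
	reqA := llmRequest{opts: a}
	fmt.Println(reqA.opts.NumCtx) // 8192: kv cache sized for all parallel requests

	// Old ordering: the request copies opts first, so scaling afterwards
	// never reaches the copy that is sent down the pending channel.
	b := options{NumCtx: 2048}
	reqB := llmRequest{opts: b}
	b.NumCtx = b.NumCtx * numParallel
	fmt.Println(reqB.opts.NumCtx) // still 2048
}

Run as-is, the first print shows the scaled context window while the second stays at the original value, which is the gap the new ordering closes.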