@@ -93,6 +93,9 @@ func InitScheduler(ctx context.Context) *Scheduler {
 // context must be canceled to decrement ref count and release the runner
 func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration time.Duration) (chan *runnerRef, chan error) {
+	// allocate a large enough kv cache for all parallel requests
+	opts.NumCtx = opts.NumCtx * numParallel
+
 	req := &LlmRequest{
 		ctx:             c,
 		model:           model,
@@ -101,8 +104,7 @@ func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options,
 		successCh:       make(chan *runnerRef),
 		errCh:           make(chan error, 1),
 	}
-	// context split across parallel threads
-	opts.NumCtx = opts.NumCtx * numParallel
+
 	select {
 	case s.pendingReqCh <- req:
 	default:
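
Why the reorder matters: if LlmRequest snapshots opts by value when the struct literal is built (the opts field is not visible in this hunk, so this is an assumption), then multiplying NumCtx after constructing req would only change the local variable, not the copy the scheduler later uses to size the KV cache. A minimal sketch of that value-copy behaviour, using hypothetical options/llmRequest stand-ins rather than the real types:

package main

import "fmt"

// options is a hypothetical stand-in for api.Options, reduced to the
// one field the change touches.
type options struct {
	NumCtx int
}

// llmRequest is a hypothetical stand-in for the scheduler's LlmRequest;
// it copies options by value, which is the assumption behind the reorder.
type llmRequest struct {
	opts options
}

func main() {
	const numParallel = 4

	// New ordering (this patch): scale before the request copies opts.
	a := options{NumCtx: 2048}
	a.NumCtx = a.NumCtx * numParallel
	reqA := llmRequest{opts: a}
	fmt.Println(reqA.opts.NumCtx) // 8192: kv cache sized for all parallel requests

	// Old ordering: the request copies opts first, so scaling afterwards
	// never reaches the copy that is sent down the pending channel.
	b := options{NumCtx: 2048}
	reqB := llmRequest{opts: b}
	b.NumCtx = b.NumCtx * numParallel
	fmt.Println(reqB.opts.NumCtx) // still 2048
}

Run as-is, the first print shows the scaled context window while the second stays at the original value, which is the gap the new ordering closes.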