
runner.go: Scale batches to be processed by numParallel

We should process a batch of tokens for each parallel request, rather
than having a shared pool. Otherwise, a single request can fill the
batch and then subsequent ones will fail or get starved.

Server.cpp used the KV cache size allocated for each parallel request as the allocated size for the batch. This is the upper bound for the batch, but since we know how many tokens we will actually put in a batch, there is no need to overallocate.
Jesse Gross · 8 months ago · commit 8e1554c91d
1 changed file with 1 addition and 2 deletions

llama/runner/runner.go (+1, -2)

@@ -198,8 +198,7 @@ func incompleteUnicode(token string) bool {
 }
 
 func (s *Server) run(ctx context.Context) {
-	// TODO - should this be n_ctx / parallel like the old server.cpp setup?
-	batch := llama.NewBatch(s.batchSize, 0, s.parallel)
+	batch := llama.NewBatch(s.batchSize*len(s.seqs), 0, len(s.seqs))
 	defer batch.Free()
 
 	// build up stop sequences as we recognize them
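The reasoning in the commit message is easier to see in a toy model. The sketch below is not the runner's actual code path; the sequence type, fillBatch, and the token slices are hypothetical stand-ins. But the capacity math mirrors NewBatch(s.batchSize*len(s.seqs), 0, len(s.seqs)) from the diff: each sequence gets its own batchSize budget per iteration, so one long prompt cannot fill the whole batch and starve the other slots.

```go
// Toy illustration (assumed names, not the real runner code): a batch with
// capacity batchSize*numParallel lets every sequence contribute up to
// batchSize tokens per decode step.
package main

import "fmt"

type sequence struct {
	id      int
	pending []int // tokens still waiting to be decoded
}

// fillBatch gives each sequence its own batchSize budget instead of a
// shared pool, matching the scaling in the diff above.
func fillBatch(seqs []*sequence, batchSize int) []int {
	batch := make([]int, 0, batchSize*len(seqs))
	for _, seq := range seqs {
		n := len(seq.pending)
		if n > batchSize {
			n = batchSize
		}
		batch = append(batch, seq.pending[:n]...)
		seq.pending = seq.pending[n:]
	}
	return batch
}

func main() {
	seqs := []*sequence{
		{id: 0, pending: make([]int, 2048)}, // long prompt
		{id: 1, pending: make([]int, 8)},    // short request
	}
	// With batchSize=512, seq 0 contributes 512 tokens and seq 1 still
	// gets its 8 in the same batch, so neither request is starved.
	batch := fillBatch(seqs, 512)
	fmt.Println("tokens in batch:", len(batch))
}
```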